options(scipen = 999)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.3
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
# Read the movies CSV, decoding the file as UTF-8.
# NOTE(review): the path is an absolute, machine-specific location; the script
# will only run where this exact file exists.
tmdb_5000_movies <- read_csv(
  file = "C:/Users/mkuma/Downloads/archive(2)/tmdb_5000_movies.csv",
  locale = locale(encoding = "UTF-8")
)
## Rows: 4803 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): genres, homepage, keywords, original_language, original_title, ov...
## dbl (7): budget, id, popularity, revenue, runtime, vote_average, vote_count
## date (1): release_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of missing values in each column of the raw data.
tmdb_5000_movies %>%
  is.na() %>%
  colSums()
## budget genres homepage
## 0 0 3091
## id keywords original_language
## 0 0 0
## original_title overview popularity
## 0 4 0
## production_companies production_countries release_date
## 0 0 1
## revenue runtime spoken_languages
## 0 2 0
## status tagline title
## 0 844 0
## vote_average vote_count
## 0 0
# Build the analysis table:
#   * keep rows with a release date and a positive budget, revenue and runtime
#     (rows with a missing runtime are dropped as well);
#   * derive year, first listed genre / production country, profit, and
#     coarse budget and rating bands.
# All derived columns are computed row by row, so filtering before the
# mutate gives the same rows as filtering after it.
movies_clean <- tmdb_5000_movies %>%
  filter(
    !is.na(release_date),
    !is.na(runtime),
    budget > 0,
    revenue > 0,
    runtime > 0
  ) %>%
  mutate(
    year = year(as.Date(release_date)),
    # First "name": "..." entry of the JSON-like string, stripped to the value.
    main_genre = str_remove_all(
      str_extract(genres, '"name":\\s*"[^"]+"'),
      '"name":\\s*"|\\"'
    ),
    main_country = str_remove_all(
      str_extract(production_countries, '"name":\\s*"[^"]+"'),
      '"name":\\s*"|\\"'
    ),
    profit = revenue - budget,
    budget_category = case_when(
      budget >= 150000000 ~ "High",
      budget >= 50000000 ~ "Medium",
      TRUE ~ "Low"
    ),
    rating_category = case_when(
      vote_average >= 7 ~ "High",
      vote_average >= 5 ~ "Medium",
      TRUE ~ "Low"
    )
  )
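Interpretation: The dataset was cleaned by removing rows with missing
values in important variables such as runtime and release date, and by
keeping only movies with a positive budget, revenue and runtime. New variables
such as year, profit, budget category, rating category, main genre and main
country were created for the analysis.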
#Exploratory data analysis
# Histogram of the average rating (vote_average), 20 bins.
movies_clean %>%
  ggplot(aes(x = vote_average)) +
  geom_histogram(bins = 20, colour = "black", fill = "thistle") +
  theme_minimal() +
  labs(
    title = "distribution of movies ratings",
    x = "ratings",
    y = "numbers of movies"
  )
Interpretation: The graph shows how ratings are spread across all
movies in the dataset.
Insights: Most movies have ratings between 5 and 7. Very few movies have extremely high or very low ratings. This means average-rated movies are the most common.
# Histogram of movie runtime, 20 bins.
# The title previously repeated the ratings-plot title; it now names the
# variable that is actually plotted.
ggplot(movies_clean, aes(x = runtime)) +
  geom_histogram(bins = 20, fill = "lightpink1", color = "black") +
  labs(
    title = "distribution of movies runtime",
    x = "runtime",
    y = "numbers of movies"
  ) +
  theme_minimal()
Interpretation: The graph shows how movie runtimes are
distributed.
Insights: Most movies have a runtime in the middle range. Very short and very long movies are relatively rare.
# Scatter of rating against number of votes, with a linear trend line.
movies_clean %>%
  ggplot(aes(x = vote_count, y = vote_average)) +
  geom_point(colour = "pink3", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "darkgreen") +
  theme_minimal() +
  labs(
    title = "Vote Count vs Rating",
    x = "Number of Votes",
    y = "Rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: This graph shows the relationship between vote
count and ratings.
Insights: Movies with more votes have more stable ratings, while movies with fewer votes show more variation. This means ratings become more reliable when more people vote.
# Scatter of rating against runtime, with a linear trend line.
movies_clean %>%
  ggplot(aes(x = runtime, y = vote_average)) +
  geom_point(colour = "violet", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "skyblue") +
  theme_minimal() +
  labs(
    title = "runtime vs rating",
    x = "runtime",
    y = "rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: This graph shows how runtime is related to
ratings.
Insights: The graph shows a slight positive relationship between runtime and rating. That means as runtime increases, rating tends to increase a little, but not very strongly.
# Number of movies per release year (one row per year, ascending).
movies_year <- movies_clean %>%
  count(year, name = "total_movies")
movies_year
## # A tibble: 89 × 2
## year total_movies
## <dbl> <int>
## 1 1916 1
## 2 1925 1
## 3 1927 1
## 4 1929 1
## 5 1930 1
## 6 1932 1
## 7 1933 2
## 8 1934 1
## 9 1935 1
## 10 1936 2
## # ℹ 79 more rows
# Line chart of the yearly movie count.
movies_year %>%
  ggplot(aes(x = year, y = total_movies)) +
  geom_line(linewidth = 1, colour = "violet") +
  theme_minimal() +
  labs(
    title = "Number of Movies Released Per Year",
    x = "Year",
    y = "Number of Movies"
  )
Interpretation: This graph shows how many movies in the dataset were released each
year.
Insights: The number of movies increases over time. This suggests growth in the film industry.
# Mean profit of the movies released in each year.
profit_year <- summarise(
  group_by(movies_clean, year),
  avg_profit = mean(profit),
  .groups = "drop"
)
profit_year
## # A tibble: 89 × 2
## year avg_profit
## <dbl> <dbl>
## 1 1916 8008844
## 2 1925 21755000
## 3 1927 -91969578
## 4 1929 3979000
## 5 1930 4050000
## 6 1932 21
## 7 1933 1921000
## 8 1934 4175000
## 9 1935 2593000
## 10 1936 5018000.
## # ℹ 79 more rows
# Line chart of mean yearly profit; y axis labelled in millions.
profit_year %>%
  ggplot(aes(x = year, y = avg_profit)) +
  geom_line(colour = "purple") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "avg Profit Over Years",
    x = "Year",
    y = "avg Profit"
  )
Interpretation: This graph shows how average profit changes over the
years.
Insights: Profit is not stable from year to year. Some years have higher profits than others. This shows fluctuation in movie success.
# Total revenue of the movies released in each year.
revenue_year <- summarise(
  group_by(movies_clean, year),
  total_revenue = sum(revenue),
  .groups = "drop"
)
revenue_year
## # A tibble: 89 × 2
## year total_revenue
## <dbl> <dbl>
## 1 1916 8394751
## 2 1925 22000000
## 3 1927 650422
## 4 1929 4358000
## 5 1930 8000000
## 6 1932 25
## 7 1933 4481000
## 8 1934 4500000
## 9 1935 3202000
## 10 1936 11236000
## # ℹ 79 more rows
# Line chart of total yearly revenue; y axis labelled in millions.
revenue_year %>%
  ggplot(aes(x = year, y = total_revenue)) +
  geom_line(colour = "red") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "Total Revenue Over Years",
    x = "Year",
    y = "Total Revenue"
  )
Interpretation: This graph shows total revenue generated each
year.
Insights: Revenue increases over time. This suggests the movie industry is growing in value.
# Mean profit and number of movies in each budget band.
profit_category <- summarise(
  group_by(movies_clean, budget_category),
  avg_profit = mean(profit),
  total_movies = n(),
  .groups = "drop"
)
profit_category
## # A tibble: 3 × 3
## budget_category avg_profit total_movies
## <chr> <dbl> <int>
## 1 High 421265144. 143
## 2 Low 43045487. 2273
## 3 Medium 125629765. 813
# Bar chart of mean profit per budget band, y axis labelled in millions.
# The x axis is pinned to Low -> Medium -> High; without explicit limits the
# character categories are drawn alphabetically (High, Low, Medium), which
# hides the ordinal trend.
ggplot(
  profit_category,
  aes(x = budget_category, y = avg_profit, fill = budget_category)
) +
  geom_col() +
  labs(
    title = "avg Profit by Budget Category",
    x = "Budget Category",
    y = "avg Profit (in Millions)"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c(
    "Low" = "purple",
    "Medium" = "lightblue",
    "High" = "darkblue"
  )) +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  # The fill only repeats the x axis, so the legend is hidden.
  theme(legend.position = "none")
Interpretation: This graph compares average profit for low, medium and high
budget movies.
Insights: High budget movies earn more profit on average, but they also require more investment.
# Ten most frequent original languages, most frequent first.
language_count <- movies_clean %>%
  count(original_language, name = "total_movies", sort = TRUE) %>%
  head(10)
language_count
## # A tibble: 10 × 2
## original_language total_movies
## <chr> <int>
## 1 en 3102
## 2 fr 25
## 3 es 15
## 4 ja 13
## 5 zh 13
## 6 de 9
## 7 hi 7
## 8 it 6
## 9 ru 6
## 10 cn 5
# Horizontal bar chart of the top languages, ordered by movie count.
language_count %>%
  ggplot(aes(x = reorder(original_language, total_movies), y = total_movies)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 Languages by Number of Movies",
    x = "Language",
    y = "Number of Movies"
  )
Interpretation: This graph shows the top 10 original languages by number of movies in the dataset.
Insights: English movies are by far the most numerous. Every other language has far fewer movies.
# Ten movies with the highest profit, title and profit only.
top_profit <- movies_clean %>%
  select(title, profit) %>%
  arrange(desc(profit)) %>%
  head(10)
top_profit
## # A tibble: 10 × 2
## title profit
## <chr> <dbl>
## 1 Avatar 2550965087
## 2 Titanic 1645034188
## 3 Jurassic World 1363528810
## 4 Furious 7 1316249360
## 5 The Avengers 1299557910
## 6 Avengers: Age of Ultron 1125403694
## 7 Frozen 1124219009
## 8 Minions 1082730962
## 9 The Lord of the Rings: The Return of the King 1024888979
## 10 Iron Man 3 1015439994
# Horizontal bar chart of the ten most profitable movies.
# The profit axis uses the same millions formatter as the other money plots
# in this file; with options(scipen = 999) the default labels would be
# unformatted ten-digit numbers.
ggplot(top_profit, aes(x = reorder(title, profit), y = profit)) +
  geom_col(fill = "pink4") +
  coord_flip() +
  labs(
    title = "Top 10 Profitable Movies",
    x = "Movie Title",
    y = "Profit"
  ) +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows the ten movies with the highest profit
(revenue minus budget) in the dataset.
Insights: Avatar is the most profitable movie, followed by Titanic and Jurassic World. Each of the top ten movies made more than 1 billion in profit.
# Ten most frequent main genres, most frequent first.
genre_count <- movies_clean %>%
  count(main_genre, name = "total_movies", sort = TRUE) %>%
  head(10)
genre_count
## # A tibble: 10 × 2
## main_genre total_movies
## <chr> <int>
## 1 Drama 747
## 2 Comedy 634
## 3 Action 588
## 4 Adventure 288
## 5 Horror 197
## 6 Crime 141
## 7 Thriller 118
## 8 Animation 99
## 9 Fantasy 93
## 10 Science Fiction 79
# Horizontal bar chart of the top genres, ordered by movie count.
genre_count %>%
  ggplot(aes(x = reorder(main_genre, total_movies), y = total_movies)) +
  geom_col(fill = "darkcyan") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top Genres by Number of Movies",
    x = "Genre",
    y = "Number of Movies"
  )
Interpretation: This graph shows which genres are most common.
Insights: Drama, Comedy and Action appear far more frequently than the other genres. This shows that certain types of movies are made more often.
# Mean profit per main genre, restricted to genres with more than 20 movies
# so that averages are not driven by a handful of titles.
profit_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(
    avg_profit = mean(profit),
    total_movies = n(),
    .groups = "drop"
  ) %>%
  filter(total_movies > 20)
profit_genre
## # A tibble: 15 × 3
## main_genre avg_profit total_movies
## <chr> <dbl> <int>
## 1 Action 97322816. 588
## 2 Adventure 170718674. 288
## 3 Animation 216260494. 99
## 4 Comedy 56210243. 634
## 5 Crime 39166665. 141
## 6 Documentary 18670263. 30
## 7 Drama 46064427. 747
## 8 Family 172206323. 38
## 9 Fantasy 119642574. 93
## 10 Horror 50941923. 197
## 11 Mystery 65985201. 27
## 12 Romance 69887043. 70
## 13 Science Fiction 146139275. 79
## 14 Thriller 60873100. 118
## 15 Western 32930690 22
# Horizontal bar chart of mean profit per genre, axis labelled in millions.
profit_genre %>%
  ggplot(aes(x = reorder(main_genre, avg_profit), y = avg_profit)) +
  geom_col(fill = "purple4") +
  coord_flip() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "avg Profit by Genre",
    x = "Genre",
    y = "avg Profit"
  )
Interpretation: This graph shows average profit for each genre.
Insights: Some genres earn more profit than others. This suggests that genre affects movie earnings.
# Mean popularity per main genre, for genres with more than 20 movies.
popularity_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(
    avg_popularity = mean(popularity),
    total_movies = n(),
    .groups = "drop"
  ) %>%
  filter(total_movies > 20)
popularity_genre
## # A tibble: 15 × 3
## main_genre avg_popularity total_movies
## <chr> <dbl> <int>
## 1 Action 34.6 588
## 2 Adventure 44.6 288
## 3 Animation 41.7 99
## 4 Comedy 20.3 634
## 5 Crime 25.2 141
## 6 Documentary 6.60 30
## 7 Drama 22.5 747
## 8 Family 52.0 38
## 9 Fantasy 40.3 93
## 10 Horror 26.1 197
## 11 Mystery 33.2 27
## 12 Romance 25.9 70
## 13 Science Fiction 49.1 79
## 14 Thriller 31.2 118
## 15 Western 26.8 22
# Horizontal bar chart of mean popularity per genre.
popularity_genre %>%
  ggplot(aes(x = reorder(main_genre, avg_popularity), y = avg_popularity)) +
  geom_col(fill = "purple") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "avg Popularity by Genre",
    x = "Genre",
    y = "Popularity"
  )
Interpretation: This graph shows average popularity across different
genres.
Insights: Some genres are more popular with audiences than others. This shows that audience interest varies by genre.
# Mean revenue and number of movies in each rating band.
revenue_rating <- summarise(
  group_by(movies_clean, rating_category),
  avg_revenue = mean(revenue),
  total_movies = n(),
  .groups = "drop"
)
revenue_rating
## # A tibble: 3 × 3
## rating_category avg_revenue total_movies
## <chr> <dbl> <int>
## 1 High 168980432. 768
## 2 Low 43147160. 192
## 3 Medium 111693381. 2269
# Bar chart of mean revenue per rating band, y axis labelled in millions.
# The x axis is pinned to Low -> Medium -> High; the default for character
# categories is alphabetical (High, Low, Medium), which scrambles the
# ordinal scale.
ggplot(
  revenue_rating,
  aes(x = rating_category, y = avg_revenue, fill = rating_category)
) +
  geom_col() +
  labs(
    title = "avg Revenue by Rating Category",
    x = "Rating Category",
    y = "avg Revenue"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(
    values = c("Low" = "seagreen2", "Medium" = "green4", "High" = "darkgreen")
  ) +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows average revenue for each rating
level.
Insights: Higher rated movies earn more money on average. This suggests that quality may influence earnings.
# Label each movie a "Hit" (profit above zero) or a "Flop" (otherwise),
# then count the movies under each label.
movies_clean <- mutate(
  movies_clean,
  success = if_else(profit > 0, "Hit", "Flop")
)
success_count <- movies_clean %>%
  count(success, name = "total")
success_count
## # A tibble: 2 × 2
## success total
## <chr> <int>
## 1 Flop 791
## 2 Hit 2438
Interpretation: This table shows how many movies are hits and how many are
flops.
Insights: Not all movies make a profit. About a quarter of the movies (791 of 3,229) did not earn back their budget.
# Band runtime into Short (< 90), Medium (90 to 120 inclusive) and Long
# (anything else), then compare mean rating and mean profit per band.
movies_clean <- mutate(
  movies_clean,
  runtime_group = case_when(
    runtime < 90 ~ "Short",
    runtime <= 120 ~ "Medium",
    TRUE ~ "Long"
  )
)
runtime_perf <- summarise(
  group_by(movies_clean, runtime_group),
  avg_rating = mean(vote_average),
  avg_profit = mean(profit),
  .groups = "drop"
)
runtime_perf
## # A tibble: 3 × 3
## runtime_group avg_rating avg_profit
## <chr> <dbl> <dbl>
## 1 Long 6.79 138279307.
## 2 Medium 6.19 62110087.
## 3 Short 5.83 51957018.
Interpretation:
This table shows the average rating and average profit for movies grouped by their runtime into short, medium, and long categories. It helps in comparing how movie length is related to both audience ratings and financial performance.
Insights:
From the table, it can be observed that long movies have the highest average rating and profit, followed by medium movies, while short movies have the lowest values. This indicates that longer movies tend to perform better overall. One possible reason is that long movies often have bigger budgets, better storytelling, and higher production quality, which attract more audience and generate higher revenue.
# Among "High" budget movies, count how many lost money (profit <= 0)
# versus how many made a profit.
high_budget_risk <- movies_clean %>%
  filter(budget_category == "High") %>%
  mutate(failure = if_else(profit <= 0, "Loss", "Profit")) %>%
  count(failure, name = "total")
high_budget_risk
## # A tibble: 2 × 2
## failure total
## <chr> <int>
## 1 Loss 6
## 2 Profit 137
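Interpretation: This table shows how many high budget
movies made a profit and how many made a loss.
Insights: Only 6 of the 143 high budget movies made a loss, so most of them recover their cost. Still, even high budget movies can fail, and when they do the amount at stake is large.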
# Ten movies whose popularity most exceeds their rating.
# Popularity is min-max scaled to [0, 1] and the rating is divided by 10 so
# the two are on a comparable scale; the gap is their difference.
# Movies with vote_average == 0 are excluded first: previously such rows
# (e.g. with near-zero popularity) ranked in the top ten only because their
# scaled rating was exactly 0, not because they were popular.
# NOTE(review): a vote_average of 0 presumably means "no votes recorded"
# rather than a genuine score - confirm against the data source.
gap_analysis <- movies_clean %>%
  filter(vote_average > 0) %>%
  mutate(
    popularity_scaled = (popularity - min(popularity)) /
      (max(popularity) - min(popularity)),
    rating_scaled = vote_average / 10,
    popularity_rating_gap = popularity_scaled - rating_scaled
  ) %>%
  arrange(desc(popularity_rating_gap)) %>%
  select(
    title, popularity, vote_average,
    popularity_scaled, rating_scaled,
    popularity_rating_gap
  ) %>%
  head(10)
gap_analysis
## # A tibble: 10 × 6
## title popularity vote_average popularity_scaled rating_scaled
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Minions 876. 6.4 1 0.64
## 2 Interstellar 724. 8.1 0.827 0.81
## 3 Naturally Native 0.579 0 0.000638 0
## 4 Mi America 0.0390 0 0.0000217 0
## 5 Deadpool 515. 7.4 0.588 0.74
## 6 Jurassic World 419. 6.5 0.478 0.65
## 7 Foodfight! 5.26 2.3 0.00598 0.23
## 8 Mad Max: Fury Road 434. 7.2 0.496 0.72
## 9 Guardians of the Gal… 481. 7.9 0.549 0.79
## 10 Disaster Movie 16.2 3 0.0185 0.3
## # ℹ 1 more variable: popularity_rating_gap <dbl>
Interpretation: This table shows the difference between scaled popularity
and scaled rating.
Insights: Some movies are popular but not highly rated. This shows that popularity does not always reflect quality.
# Mean runtime per main genre, for genres with more than 20 movies.
runtime_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(
    avg_runtime = mean(runtime),
    total_movies = n(),
    .groups = "drop"
  ) %>%
  filter(total_movies > 20)
runtime_genre
## # A tibble: 15 × 3
## main_genre avg_runtime total_movies
## <chr> <dbl> <int>
## 1 Action 111. 588
## 2 Adventure 116. 288
## 3 Animation 91.3 99
## 4 Comedy 103. 634
## 5 Crime 116. 141
## 6 Documentary 98.6 30
## 7 Drama 121. 747
## 8 Family 94.2 38
## 9 Fantasy 108. 93
## 10 Horror 98.1 197
## 11 Mystery 105. 27
## 12 Romance 108. 70
## 13 Science Fiction 113. 79
## 14 Thriller 111. 118
## 15 Western 122. 22
# Horizontal bar chart of mean runtime per genre.
runtime_genre %>%
  ggplot(aes(x = reorder(main_genre, avg_runtime), y = avg_runtime)) +
  geom_col(fill = "thistle") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "avg Runtime by Genre",
    x = "Genre",
    y = "avg Runtime"
  )
Interpretation: This graph shows average runtime for each
genre.
Insights: Different genres have different runtimes. This suggests that storytelling style varies by genre.
# Mean revenue per main genre, for genres with more than 20 movies.
revenue_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(
    avg_revenue = mean(revenue),
    total_movies = n(),
    .groups = "drop"
  ) %>%
  filter(total_movies > 20)
revenue_genre
## # A tibble: 15 × 3
## main_genre avg_revenue total_movies
## <chr> <dbl> <int>
## 1 Action 155542667. 588
## 2 Adventure 246084431. 288
## 3 Animation 298941629. 99
## 4 Comedy 83654762. 634
## 5 Crime 66727636. 141
## 6 Documentary 23801772. 30
## 7 Drama 72425378. 747
## 8 Family 234061586. 38
## 9 Fantasy 183410733. 93
## 10 Horror 67269518. 197
## 11 Mystery 98494830. 27
## 12 Romance 98928146. 70
## 13 Science Fiction 204763414. 79
## 14 Thriller 99042761. 118
## 15 Western 60836316. 22
# Horizontal bar chart of mean revenue per genre.
# The revenue axis uses the same millions formatter as the other money plots
# in this file; with options(scipen = 999) the default labels would be
# unformatted nine-digit numbers.
ggplot(revenue_genre, aes(x = reorder(main_genre, avg_revenue), y = avg_revenue)) +
  geom_col(fill = "orange") +
  coord_flip() +
  labs(
    title = "avg revenue by genre",
    x = "genre",
    y = "avg revenue"
  ) +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows average revenue earned by
different genres.
Insights: Some genres generate higher revenue than others. This suggests that genre selection plays an important role in movie earnings.
# Yearly movie counts for the five most frequent main genres.
# Filtering to the top genres before counting gives the same rows as
# counting first and filtering afterwards.
genre_time <- movies_clean %>%
  filter(
    main_genre %in%
      head(names(sort(table(movies_clean$main_genre), decreasing = TRUE)), 5)
  ) %>%
  count(year, main_genre, name = "total")
genre_time
## # A tibble: 292 × 3
## year main_genre total
## <dbl> <chr> <int>
## 1 1916 Drama 1
## 2 1925 Drama 1
## 3 1927 Drama 1
## 4 1929 Drama 1
## 5 1930 Action 1
## 6 1932 Drama 1
## 7 1933 Comedy 1
## 8 1934 Comedy 1
## 9 1935 Comedy 1
## 10 1936 Action 1
## # ℹ 282 more rows
# One line per top genre showing its yearly movie count.
genre_time %>%
  ggplot(aes(x = year, y = total, colour = main_genre)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "top genres trend over time",
    x = "year",
    y = "number of movies"
  )
Interpretation: This graph shows how the number of movies in the top genres
changes over the years.
Insights: Some genres grow more over time while others remain stable. This shows changing trends in audience preferences. The rise of genres like Action and Adventure can also be linked to advancements in technology and visual effects. The slight decline in recent years may be due to changes in distribution platforms or the shift towards streaming services.
# Boxplot of runtime within each budget band.
# The x axis is pinned to Low -> Medium -> High; the default for character
# categories is alphabetical (High, Low, Medium), which makes the bands hard
# to compare in order.
ggplot(
  movies_clean,
  aes(x = budget_category, y = runtime, fill = budget_category)
) +
  geom_boxplot() +
  labs(
    title = "runtime distribution by budget category",
    x = "budget category",
    y = "runtime"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  # The fill only repeats the x axis, so the legend is hidden.
  theme(legend.position = "none")
Interpretation: This boxplot shows how runtime varies across
different budget categories.
Insights: High budget movies generally have a slightly higher runtime than low budget movies, and medium budget movies fall in between. However, there is overlap between all categories, showing that runtime is not strictly dependent on budget. Some low budget movies also have a very high runtime, indicating variation in movie length across all budget types.
# Number of hits (profit above zero) and flops within each rating band.
# The success label is recomputed here so the block does not depend on the
# column having been added earlier.
hit_rating <- movies_clean %>%
  mutate(success = if_else(profit > 0, "Hit", "Flop")) %>%
  count(rating_category, success, name = "total")
hit_rating
## # A tibble: 6 × 3
## rating_category success total
## <chr> <chr> <int>
## 1 High Flop 88
## 2 High Hit 680
## 3 Low Flop 98
## 4 Low Hit 94
## 5 Medium Flop 605
## 6 Medium Hit 1664
# Side-by-side bars of hit and flop counts per rating band.
# The x axis is pinned to Low -> Medium -> High; the default for character
# categories is alphabetical (High, Low, Medium).
ggplot(hit_rating, aes(x = rating_category, y = total, fill = success)) +
  geom_col(position = "dodge") +
  labs(
    title = "hit vs flop by rating category",
    x = "rating category",
    y = "number of movies"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High"))
Interpretation: This graph shows the number of hit and flop movies
across rating categories.
Insights: High rated movies have far more hits than flops, while low rated movies fail about as often as they succeed. This shows that rating is related to success. However, we can also see that even some high rated movies are flops, and some low rated movies are still hits. This shows that rating is an important factor, but not the only factor affecting success.
# Movie counts per rating band for the six most frequent original languages.
# head(..., 6) replaces the previous `[1:6]` index: indexing past the end of
# the table pads the name vector with NA (which `%in%` would then match
# against missing languages), whereas head() simply returns what exists.
# It also matches the head() idiom used for the top-genre filter.
lang_rating <- movies_clean %>%
  group_by(original_language, rating_category) %>%
  summarise(total = n(), .groups = "drop") %>%
  filter(
    original_language %in%
      head(names(sort(table(movies_clean$original_language), decreasing = TRUE)), 6)
  )
lang_rating
## # A tibble: 14 × 3
## original_language rating_category total
## <chr> <chr> <int>
## 1 de High 8
## 2 de Low 1
## 3 en High 702
## 4 en Low 189
## 5 en Medium 2211
## 6 es High 10
## 7 es Medium 5
## 8 fr High 11
## 9 fr Medium 14
## 10 ja High 7
## 11 ja Medium 6
## 12 zh High 5
## 13 zh Low 1
## 14 zh Medium 7
# Side-by-side bars of rating-band counts for each top language.
lang_rating %>%
  ggplot(aes(x = original_language, y = total, fill = rating_category)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "language vs rating category",
    x = "language",
    y = "number of movies"
  )
Interpretation: This graph shows the distribution of rating categories
across different languages.
Insights: English language movies dominate in all categories, especially in the medium rating category, followed by a large number of high rated movies. Other languages like French, Spanish, Japanese, Chinese and German have far fewer movies overall, but most of them fall into the high or medium rating categories.
There are very few low rated movies in these languages compared to English.
# Pairwise correlations between the numeric measures, drawn as the upper
# triangle of a circle-based correlation plot.
numeric_data <- select(
  movies_clean,
  budget, revenue, profit, runtime, vote_average, vote_count, popularity
)
cor_matrix <- cor(numeric_data)
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.cex = 0.8
)
Interpretation: This correlation plot shows the relationships between
numerical variables like budget, revenue, profit, rating, and
popularity.
Insights: Budget and revenue show a strong positive relationship. Profit is also strongly related to revenue. Rating and popularity have weaker relationships with the other variables.
# Ten movies with the lowest profit; `loss` is the absolute value of that
# profit so it can be plotted as a positive bar length.
loss_movies <- movies_clean %>%
  select(title, profit) %>%
  arrange(profit) %>%
  head(10) %>%
  mutate(loss = abs(profit))
# Horizontal bar chart of the ten largest losses.
# The loss axis uses the same millions formatter as the other money plots in
# this file; with options(scipen = 999) the default labels would be
# unformatted multi-digit numbers.
ggplot(loss_movies, aes(x = reorder(title, loss), y = loss)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(
    title = "top 10 loss making movies",
    x = "movie",
    y = "loss"
  ) +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows the movies with the highest losses.
Insights: Some movies face very high losses. This shows that the movie industry carries risk and not all investments are successful.
# Scatter of revenue against number of votes with a linear trend line;
# revenue axis labelled in millions.
movies_clean %>%
  ggplot(aes(x = vote_count, y = revenue)) +
  geom_point(colour = "purple", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "black") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "vote count vs revenue",
    x = "vote count",
    y = "revenue"
  )
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: This graph shows the relationship between vote count
and revenue.
Insights: Movies with more votes tend to earn more, which suggests that audience engagement is linked to earnings. However, the data points are widely scattered, especially at higher vote counts. This means that although there is a general upward trend, the relationship is not very strong or consistent. This may be because vote count reflects audience engagement, while revenue is influenced by multiple factors like budget, marketing, genre, and release timing. So even if a movie gets many votes, it may not always generate the highest revenue.
# Standard deviation of ratings per main genre, for genres with more than
# 20 movies.
rating_variation <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(
    rating_sd = sd(vote_average),
    total_movies = n(),
    .groups = "drop"
  ) %>%
  filter(total_movies > 20)
rating_variation
## # A tibble: 15 × 3
## main_genre rating_sd total_movies
## <chr> <dbl> <int>
## 1 Action 0.857 588
## 2 Adventure 0.818 288
## 3 Animation 0.911 99
## 4 Comedy 0.813 634
## 5 Crime 0.713 141
## 6 Documentary 0.809 30
## 7 Drama 0.859 747
## 8 Family 0.728 38
## 9 Fantasy 0.885 93
## 10 Horror 0.809 197
## 11 Mystery 0.863 27
## 12 Romance 0.817 70
## 13 Science Fiction 0.873 79
## 14 Thriller 0.849 118
## 15 Western 0.685 22
# Horizontal bar chart of the rating standard deviation per genre.
rating_variation %>%
  ggplot(aes(x = reorder(main_genre, rating_sd), y = rating_sd)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "rating variation by genre",
    x = "genre",
    y = "standard deviation of ratings"
  )
Interpretation: This graph shows the variation in ratings within each
genre.
Insights: Some genres have more consistent ratings while others show higher variation. This means audience response differs across genres.
# Count how many genres each movie lists (occurrences of "name" in the
# genres string), then compare mean rating, mean profit and movie count for
# each genre count.
movies_clean <- mutate(
  movies_clean,
  genre_count = str_count(genres, "name")
)
genre_complexity <- summarise(
  group_by(movies_clean, genre_count),
  avg_rating = mean(vote_average),
  avg_profit = mean(profit),
  total_movies = n(),
  .groups = "drop"
)
genre_complexity
## # A tibble: 8 × 4
## genre_count avg_rating avg_profit total_movies
## <int> <dbl> <dbl> <int>
## 1 0 5 -327270 1
## 2 1 6.28 46654050. 535
## 3 2 6.36 62260020. 928
## 4 3 6.32 98843290. 1103
## 5 4 6.30 105623279. 483
## 6 5 6.15 103127251. 149
## 7 6 5.8 70435306. 28
## 8 7 5.15 52538631 2
# Line-and-point chart of mean rating against number of listed genres.
genre_complexity %>%
  ggplot(aes(x = genre_count, y = avg_rating)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(
    title = "number of genres vs rating",
    x = "number of genres",
    y = "average rating"
  )
Interpretation: This graph shows how the number of genres relates to movie
rating.
Insights: Movies with a moderate number of genres (around 2–4) tend to perform better, while having too many genres can reduce focus and lead to lower ratings. Note also that very low and very high genre counts have very few movies, so those values may be less reliable.
# Mean revenue for the ten main production countries with the most movies.
country_analysis <- movies_clean %>%
  group_by(main_country) %>%
  summarise(
    avg_revenue = mean(revenue),
    total_movies = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(total_movies)) %>%
  head(10)
country_analysis
## # A tibble: 10 × 3
## main_country avg_revenue total_movies
## <chr> <dbl> <int>
## 1 United States of America 124898009. 2246
## 2 United Kingdom 143863130. 254
## 3 Germany 115757931. 144
## 4 Canada 92259291. 130
## 5 France 75047292. 108
## 6 Australia 114679392. 62
## 7 Japan 161400115. 25
## 8 China 186544279. 24
## 9 India 87262122. 23
## 10 Spain 76997000. 22
# Horizontal bar chart of mean revenue per country, axis labelled in
# millions.
country_analysis %>%
  ggplot(aes(x = reorder(main_country, avg_revenue), y = avg_revenue)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "average revenue by country",
    x = "country",
    y = "average revenue"
  )
Interpretation: This graph shows the average revenue generated by
movies from the ten countries with the most movies in the dataset.
Insights: Movies from some countries earn higher revenue on average. This suggests that production location is related to earnings.