options(scipen = 999)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.3
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
# Load the raw TMDB 5000 movies table, decoding the file as UTF-8.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
tmdb_5000_movies <- read_csv(
  file = "C:/Users/mkuma/Downloads/archive(2)/tmdb_5000_movies.csv",
  locale = locale(encoding = "UTF-8")
)
## Rows: 4803 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): genres, homepage, keywords, original_language, original_title, ov...
## dbl (7): budget, id, popularity, revenue, runtime, vote_average, vote_count
## date (1): release_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count the missing values in every column of the raw data.
tmdb_5000_movies %>%
  is.na() %>%
  colSums()
## budget genres homepage
## 0 0 3091
## id keywords original_language
## 0 0 0
## original_title overview popularity
## 0 4 0
## production_companies production_countries release_date
## 0 0 1
## revenue runtime spoken_languages
## 0 2 0
## status tagline title
## 0 844 0
## vote_average vote_count
## 0 0
# Build the analysis table.
#   1. Keep only rows with a known runtime and release date and with strictly
#      positive budget, revenue and runtime.
#   2. Derive the release year, the first genre name found in the `genres`
#      string, the profit (revenue - budget) and two three-level categories
#      for budget and rating.
movies_clean <- tmdb_5000_movies %>%
  filter(
    !is.na(runtime),
    !is.na(release_date),
    budget > 0,
    revenue > 0,
    runtime > 0
  ) %>%
  mutate(
    year = year(as.Date(release_date)),
    # Grab the first `"name": "..."` pair, then strip everything but the value.
    main_genre = genres %>%
      str_extract('"name":\\s*"[^"]+"') %>%
      str_remove_all('"name":\\s*"|\\"'),
    profit = revenue - budget,
    budget_category = case_when(
      budget >= 150000000 ~ "High",
      budget >= 50000000 ~ "Medium",
      TRUE ~ "Low"
    ),
    rating_category = case_when(
      vote_average >= 7 ~ "High",
      vote_average >= 5 ~ "Medium",
      TRUE ~ "Low"
    )
  )
Interpretation: The dataset was cleaned by removing rows with missing
values in important variables such as runtime and release date, and by
keeping only movies with a positive budget, revenue and runtime. New
variables such as year, profit, budget category, rating category and
main genre were created for analysis.
#Exploratory data analysis
# Histogram (20 bins) of the average vote across the cleaned movies.
ggplot(data = movies_clean, mapping = aes(x = vote_average)) +
  geom_histogram(bins = 20, fill = "thistle", colour = "black") +
  ggtitle("distribution of movies ratings") +
  xlab("ratings") +
  ylab("numbers of movies") +
  theme_minimal()
Interpretation: The graph shows how ratings are spread across all
movies in the dataset.
Insights: Most movies have ratings between 5 and 7. Very few movies have extremely high or very low ratings. This means average-rated movies are the most common.
# Histogram (20 bins) of movie runtimes across the cleaned movies.
# The title previously read "distribution of movies ratings" (copied from the
# rating histogram); it now names the variable that is actually plotted.
ggplot(movies_clean, aes(x = runtime)) +
  geom_histogram(bins = 20, fill = "lightpink1", color = "black") +
  labs(
    title = "distribution of movies runtime",
    x = "runtime",
    y = "numbers of movies"
  ) +
  theme_minimal()
Interpretation: The graph shows how movie runtimes are
distributed.
Insights: Most movies have a runtime in the middle range. Very short and very long movies are relatively rare.
# Scatter of rating against number of votes, with a fitted linear trend line.
ggplot(data = movies_clean, mapping = aes(x = vote_count, y = vote_average)) +
  geom_point(colour = "pink3", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "darkgreen") +
  ggtitle("Vote Count vs Rating") +
  xlab("Number of Votes") +
  ylab("Rating") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: This graph shows the relationship between vote
count and ratings.
Insights: Movies with more votes have more stable ratings. Movies with fewer votes show more variation. This means ratings become more reliable when more people vote.
# Scatter of rating against runtime, with a fitted linear trend line.
ggplot(data = movies_clean, mapping = aes(x = runtime, y = vote_average)) +
  geom_point(colour = "violet", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "skyblue") +
  ggtitle("runtime vs rating") +
  xlab("runtime") +
  ylab("rating") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: This graph shows how runtime is related to
ratings.
Insights: From the graph, there is a slight positive relationship between runtime and rating. That means as runtime increases, the rating tends to increase a little, but not very strongly.
# Number of movies per release year (one row per year, ascending).
movies_year <- count(movies_clean, year, name = "total_movies")
movies_year
## # A tibble: 89 × 2
## year total_movies
## <dbl> <int>
## 1 1916 1
## 2 1925 1
## 3 1927 1
## 4 1929 1
## 5 1930 1
## 6 1932 1
## 7 1933 2
## 8 1934 1
## 9 1935 1
## 10 1936 2
## # ℹ 79 more rows
# Line chart of the yearly movie counts.
ggplot(data = movies_year, mapping = aes(x = year, y = total_movies)) +
  geom_line(linewidth = 1, colour = "violet") +
  ggtitle("Number of Movies Released Per Year") +
  xlab("Year") +
  ylab("Number of Movies") +
  theme_minimal()
Interpretation: This graph shows how many movies were released each
year.
Insights: The number of movies in the dataset increases over time. This suggests growth in the film industry.
# Mean profit of the movies released in each year.
profit_year <- summarise(
  group_by(movies_clean, year),
  avg_profit = mean(profit),
  .groups = "drop"
)
profit_year
## # A tibble: 89 × 2
## year avg_profit
## <dbl> <dbl>
## 1 1916 8008844
## 2 1925 21755000
## 3 1927 -91969578
## 4 1929 3979000
## 5 1930 4050000
## 6 1932 21
## 7 1933 1921000
## 8 1934 4175000
## 9 1935 2593000
## 10 1936 5018000.
## # ℹ 79 more rows
# Line chart of average profit per year; the y axis is labelled in millions.
ggplot(data = profit_year, mapping = aes(x = year, y = avg_profit)) +
  geom_line(colour = "purple") +
  ggtitle("avg Profit Over Years") +
  xlab("Year") +
  ylab("avg Profit") +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows how average profit changes over
years.
Insights: Profit is not stable from year to year. Some years have higher average profits than others. This shows fluctuation in movie success.
# Total revenue of the movies released in each year.
revenue_year <- summarise(
  group_by(movies_clean, year),
  total_revenue = sum(revenue),
  .groups = "drop"
)
revenue_year
## # A tibble: 89 × 2
## year total_revenue
## <dbl> <dbl>
## 1 1916 8394751
## 2 1925 22000000
## 3 1927 650422
## 4 1929 4358000
## 5 1930 8000000
## 6 1932 25
## 7 1933 4481000
## 8 1934 4500000
## 9 1935 3202000
## 10 1936 11236000
## # ℹ 79 more rows
# Line chart of total revenue per year; the y axis is labelled in millions.
ggplot(data = revenue_year, mapping = aes(x = year, y = total_revenue)) +
  geom_line(colour = "red") +
  ggtitle("Total Revenue Over Years") +
  xlab("Year") +
  ylab("Total Revenue") +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows total revenue generated each
year.
Insights: Revenue increases over time. This means the movie industry is growing in value.
# Mean profit and number of movies within each budget category.
profit_category <- summarise(
  group_by(movies_clean, budget_category),
  avg_profit = mean(profit),
  total_movies = n(),
  .groups = "drop"
)
profit_category
## # A tibble: 3 × 3
## budget_category avg_profit total_movies
## <chr> <dbl> <int>
## 1 High 421265144. 143
## 2 Low 43045487. 2273
## 3 Medium 125629765. 813
# Bar chart of average profit per budget category.
# `budget_category` is a character column, so ggplot would otherwise order the
# bars alphabetically (High, Low, Medium). The explicit x-axis limits put the
# ordinal categories in their natural Low -> Medium -> High order.
ggplot(
  profit_category,
  aes(x = budget_category, y = avg_profit, fill = budget_category)
) +
  geom_col() +
  labs(
    title = "avg Profit by Budget Category",
    x = "Budget Category",
    y = "avg Profit (in Millions)"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c(
    "Low" = "purple",
    "Medium" = "lightblue",
    "High" = "darkblue"
  )) +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  # The fill colours repeat the x axis, so the legend carries no information.
  theme(legend.position = "none")
Interpretation: This graph compares profit for low, medium and high
budget movies.
Insights: High budget movies earn more profit on average, but they also require more investment.
# The ten original languages with the most movies, most frequent first.
language_count <- movies_clean %>%
  count(original_language, name = "total_movies", sort = TRUE) %>%
  head(10)
language_count
## # A tibble: 10 × 2
## original_language total_movies
## <chr> <int>
## 1 en 3102
## 2 fr 25
## 3 es 15
## 4 ja 13
## 5 zh 13
## 6 de 9
## 7 hi 7
## 8 it 6
## 9 ru 6
## 10 cn 5
# Horizontal bar chart of the top languages, bars sorted by movie count.
ggplot(
  data = language_count,
  mapping = aes(
    x = reorder(original_language, total_movies),
    y = total_movies
  )
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  ggtitle("Top 10 Languages by Number of Movies") +
  xlab("Language") +
  ylab("Number of Movies") +
  theme_minimal()
Interpretation: This graph shows the top 10 original languages by number of movies in the dataset.
Insights: English movies are by far the most numerous (3102). Every other language has far fewer movies (25 or fewer).
# Title and profit of the ten most profitable movies, highest profit first.
top_profit <- movies_clean %>%
  select(title, profit) %>%
  arrange(desc(profit)) %>%
  head(10)
top_profit
## # A tibble: 10 × 2
## title profit
## <chr> <dbl>
## 1 Avatar 2550965087
## 2 Titanic 1645034188
## 3 Jurassic World 1363528810
## 4 Furious 7 1316249360
## 5 The Avengers 1299557910
## 6 Avengers: Age of Ultron 1125403694
## 7 Frozen 1124219009
## 8 Minions 1082730962
## 9 The Lord of the Rings: The Return of the King 1024888979
## 10 Iron Man 3 1015439994
# Horizontal bar chart of the ten most profitable movies, sorted by profit.
ggplot(
  data = top_profit,
  mapping = aes(x = reorder(title, profit), y = profit)
) +
  geom_col(fill = "pink4") +
  coord_flip() +
  ggtitle("Top 10 Profitable Movies") +
  xlab("Movie Title") +
  ylab("Profit") +
  theme_minimal()
Interpretation: This graph shows the ten most profitable movies in the
dataset.
Insights: Avatar is the most profitable movie, followed by Titanic and Jurassic World. Every movie in the top 10 earned more than 1 billion in profit.
# The ten main genres with the most movies, most frequent first.
genre_count <- movies_clean %>%
  count(main_genre, name = "total_movies", sort = TRUE) %>%
  head(10)
genre_count
## # A tibble: 10 × 2
## main_genre total_movies
## <chr> <int>
## 1 Drama 747
## 2 Comedy 634
## 3 Action 588
## 4 Adventure 288
## 5 Horror 197
## 6 Crime 141
## 7 Thriller 118
## 8 Animation 99
## 9 Fantasy 93
## 10 Science Fiction 79
# Horizontal bar chart of the most common main genres, sorted by movie count.
ggplot(
  data = genre_count,
  mapping = aes(x = reorder(main_genre, total_movies), y = total_movies)
) +
  geom_col(fill = "darkcyan") +
  coord_flip() +
  ggtitle("Top Genres by Number of Movies") +
  xlab("Genre") +
  ylab("Number of Movies") +
  theme_minimal()
Interpretation: This graph shows which genres are most common.
Insights: Some genres appear more frequently than others. This shows that certain types of movies are made more often.
# Mean profit per main genre, keeping only genres with more than 20 movies
# so that averages are not driven by a handful of titles.
profit_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_profit = mean(profit),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)
profit_genre
## # A tibble: 15 × 3
## main_genre avg_profit total_movies
## <chr> <dbl> <int>
## 1 Action 97322816. 588
## 2 Adventure 170718674. 288
## 3 Animation 216260494. 99
## 4 Comedy 56210243. 634
## 5 Crime 39166665. 141
## 6 Documentary 18670263. 30
## 7 Drama 46064427. 747
## 8 Family 172206323. 38
## 9 Fantasy 119642574. 93
## 10 Horror 50941923. 197
## 11 Mystery 65985201. 27
## 12 Romance 69887043. 70
## 13 Science Fiction 146139275. 79
## 14 Thriller 60873100. 118
## 15 Western 32930690 22
# Horizontal bar chart of average profit per genre, labelled in millions.
ggplot(
  data = profit_genre,
  mapping = aes(x = reorder(main_genre, avg_profit), y = avg_profit)
) +
  geom_col(fill = "purple4") +
  coord_flip() +
  ggtitle("avg Profit by Genre") +
  xlab("Genre") +
  ylab("avg Profit") +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )
Interpretation: This graph shows average profit for each genre.
Insights: Some genres earn more profit than others. This suggests that genre affects movie earnings.
# Mean popularity score per main genre, keeping only genres with more than
# 20 movies.
popularity_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_popularity = mean(popularity),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)
popularity_genre
## # A tibble: 15 × 3
## main_genre avg_popularity total_movies
## <chr> <dbl> <int>
## 1 Action 34.6 588
## 2 Adventure 44.6 288
## 3 Animation 41.7 99
## 4 Comedy 20.3 634
## 5 Crime 25.2 141
## 6 Documentary 6.60 30
## 7 Drama 22.5 747
## 8 Family 52.0 38
## 9 Fantasy 40.3 93
## 10 Horror 26.1 197
## 11 Mystery 33.2 27
## 12 Romance 25.9 70
## 13 Science Fiction 49.1 79
## 14 Thriller 31.2 118
## 15 Western 26.8 22
# Horizontal bar chart of average popularity per genre, sorted by popularity.
ggplot(
  data = popularity_genre,
  mapping = aes(
    x = reorder(main_genre, avg_popularity),
    y = avg_popularity
  )
) +
  geom_col(fill = "purple") +
  coord_flip() +
  ggtitle("avg Popularity by Genre") +
  xlab("Genre") +
  ylab("Popularity") +
  theme_minimal()
Interpretation: This graph shows popularity across different
genres.
Insights: Some genres are more popular among audiences. This shows that audience interest varies by genre.
# Mean revenue and number of movies within each rating category.
revenue_rating <- summarise(
  group_by(movies_clean, rating_category),
  avg_revenue = mean(revenue),
  total_movies = n(),
  .groups = "drop"
)
revenue_rating
## # A tibble: 3 × 3
## rating_category avg_revenue total_movies
## <chr> <dbl> <int>
## 1 High 168980432. 768
## 2 Low 43147160. 192
## 3 Medium 111693381. 2269
# Bar chart of average revenue per rating category, labelled in millions.
# `rating_category` is a character column, so ggplot would otherwise order the
# bars alphabetically (High, Low, Medium). The explicit x-axis limits put the
# ordinal categories in their natural Low -> Medium -> High order.
ggplot(
  revenue_rating,
  aes(x = rating_category, y = avg_revenue, fill = rating_category)
) +
  geom_col() +
  labs(
    title = "avg Revenue by Rating Category",
    x = "Rating Category",
    y = "avg Revenue"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c(
    "Low" = "seagreen2",
    "Medium" = "green4",
    "High" = "darkgreen"
  )) +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  # The fill colours repeat the x axis, so the legend is hidden, matching the
  # budget-category chart earlier in the script.
  theme(legend.position = "none")
Interpretation: This graph shows revenue based on rating
levels.
Insights: Higher rated movies earn more money on average. This suggests that quality may influence earnings.
# Flag every movie as a "Hit" when its profit is strictly positive and as a
# "Flop" otherwise (break-even counts as a flop).
movies_clean <- mutate(
  movies_clean,
  success = ifelse(profit > 0, "Hit", "Flop")
)

# Number of hits and flops.
success_count <- count(movies_clean, success, name = "total")
success_count
## # A tibble: 2 × 2
## success total
## <chr> <int>
## 1 Flop 791
## 2 Hit 2438
Interpretation: This table shows how many movies are hits or
flops.
Insights: Not all movies make a profit. Some movies fail even after release: 791 of the 3229 movies are flops.
# Bucket runtimes: under 90 minutes is "Short", 90 to 120 inclusive is
# "Medium", anything else is "Long".
movies_clean <- mutate(
  movies_clean,
  runtime_group = case_when(
    runtime < 90 ~ "Short",
    runtime <= 120 ~ "Medium",
    TRUE ~ "Long"
  )
)

# Mean rating and mean profit within each runtime bucket.
runtime_perf <- summarise(
  group_by(movies_clean, runtime_group),
  avg_rating = mean(vote_average),
  avg_profit = mean(profit),
  .groups = "drop"
)
runtime_perf
## # A tibble: 3 × 3
## runtime_group avg_rating avg_profit
## <chr> <dbl> <dbl>
## 1 Long 6.79 138279307.
## 2 Medium 6.19 62110087.
## 3 Short 5.83 51957018.
Interpretation: This table compares rating and profit for short,
medium and long movies.
Insights: Long movies (over 120 minutes) have the highest average rating (6.79) and the highest average profit. Short movies (under 90 minutes) have the lowest of both.
# Among high-budget movies only, count how many lost money (profit <= 0)
# versus how many made a profit.
high_budget_risk <- movies_clean %>%
  filter(budget_category == "High") %>%
  count(failure = ifelse(profit <= 0, "Loss", "Profit"), name = "total")
high_budget_risk
## # A tibble: 2 × 2
## failure total
## <chr> <int>
## 1 Loss 6
## 2 Profit 137
Interpretation: This table shows profit and loss in high budget
movies.
Insights: Even high budget movies can fail: 6 of the 143 high budget movies made a loss. This shows that a high investment still carries some risk.
# Hype score: popularity minus ten times the average vote, i.e. how far the
# popularity score exceeds the rating once the rating is scaled by 10.
movies_clean <- mutate(movies_clean, hype = popularity - 10 * vote_average)

# The ten movies with the largest hype score.
hype_movies <- movies_clean %>%
  select(title, popularity, vote_average, hype) %>%
  arrange(desc(hype)) %>%
  head(10)
hype_movies
## # A tibble: 10 × 4
## title popularity vote_average hype
## <chr> <dbl> <dbl> <dbl>
## 1 Minions 876. 6.4 812.
## 2 Interstellar 724. 8.1 643.
## 3 Deadpool 515. 7.4 441.
## 4 Guardians of the Galaxy 481. 7.9 402.
## 5 Mad Max: Fury Road 434. 7.2 362.
## 6 Jurassic World 419. 6.5 354.
## 7 Pirates of the Caribbean: The Curse of the Bla… 272. 7.5 197.
## 8 Dawn of the Planet of the Apes 244. 7.3 171.
## 9 Terminator Genisys 202. 5.8 144.
## 10 The Hunger Games: Mockingjay - Part 1 206. 6.6 140.
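Interpretation: This table shows the movies whose popularity most
exceeds their rating (hype = popularity - 10 x vote_average).
Insights: These movies may be overhyped. However, several of them are also highly rated (for example Interstellar, 8.1), so the ranking is driven mainly by very high popularity rather than by low ratings.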
# Mean runtime per main genre, keeping only genres with more than 20 movies.
runtime_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_runtime = mean(runtime),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)
runtime_genre
## # A tibble: 15 × 3
## main_genre avg_runtime total_movies
## <chr> <dbl> <int>
## 1 Action 111. 588
## 2 Adventure 116. 288
## 3 Animation 91.3 99
## 4 Comedy 103. 634
## 5 Crime 116. 141
## 6 Documentary 98.6 30
## 7 Drama 121. 747
## 8 Family 94.2 38
## 9 Fantasy 108. 93
## 10 Horror 98.1 197
## 11 Mystery 105. 27
## 12 Romance 108. 70
## 13 Science Fiction 113. 79
## 14 Thriller 111. 118
## 15 Western 122. 22
# Horizontal bar chart of average runtime per genre, sorted by runtime.
ggplot(
  data = runtime_genre,
  mapping = aes(x = reorder(main_genre, avg_runtime), y = avg_runtime)
) +
  geom_col(fill = "thistle") +
  coord_flip() +
  ggtitle("avg Runtime by Genre") +
  xlab("Genre") +
  ylab("avg Runtime") +
  theme_minimal()
Interpretation: This graph shows average runtime for each
genre.
Insights: Different genres have different runtimes. This suggests that storytelling style varies by genre.