---
title: "Movie Recommendation Analysis"
author: "Aashish Kushwaha"
date: "2026-04-06"
output:
  html_document:
    toc: true
    toc_float:
      collapsed: false
    number_sections: true
    theme: cosmo
---
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(tidyr)
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(caret)
## Loading required package: lattice
# Read the titles dataset with readr; the relative path is resolved against
# the current working directory.
data <- read_csv("netflix_dataset_clean_titles.csv")
## Rows: 2000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): title, genre, country, language, type, is_sequel, is_original, pla...
## dbl (13): year, rating, votes, duration_min, budget_million, revenue_million...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the readr result to a base data.frame and derive profit as
# revenue_million minus budget_million in one pipeline.
data <- data %>%
  as.data.frame() %>%
  mutate(profit_million = revenue_million - budget_million)

# Inspect column names, types and leading values.
str(data)
## 'data.frame': 2000 obs. of 21 variables:
## $ title : chr "Golden Odyssey" "Velvet Nights" "Fading Memories" "Secret Nights" ...
## $ genre : chr "Action" "Drama" "Drama" "Animation" ...
## $ year : num 2015 2022 2000 1999 2014 ...
## $ rating : num 5.58 7.07 5.67 4.48 4.01 6.47 8.6 7.15 9.5 5.58 ...
## $ votes : num 1432041 273667 3454402 4733361 2030122 ...
## $ duration_min : num 99 106 100 106 162 79 136 168 124 175 ...
## $ country : chr "Australia" "Canada" "UK" "Spain" ...
## $ language : chr "French" "Korean" "Spanish" "French" ...
## $ type : chr "Movie" "Movie" "TV Show" "Movie" ...
## $ budget_million : num 292.01 91.46 208.36 3.66 57.4 ...
## $ revenue_million : num 641 381 185 760 208 ...
## $ popularity_score : num 94.9 77.6 96.3 26.4 83.7 ...
## $ awards_won : num 7 8 9 10 0 14 7 18 11 5 ...
## $ critic_score : num 44.6 51.8 38.1 62 98.9 ...
## $ user_score : num 99.3 59.5 36.8 38.4 70.9 ...
## $ is_sequel : chr "No" "No" "No" "No" ...
## $ is_original : chr "No" "Yes" "Yes" "No" ...
## $ platform : chr "Netflix" "Netflix" "Hulu" "Disney+" ...
## $ release_month : num 11 11 6 10 12 6 12 3 6 10 ...
## $ weekend_collection_million: num 4.78 7.6 44.96 37.78 18.59 ...
## $ profit_million : num 349.4 289.5 -22.9 756.6 150.2 ...
dim(data)
## [1] 2000 21
Answer: Shows structure and dimensions of dataset.
# Highest-revenue title: sort by revenue, largest first, and keep the top row.
data %>%
  arrange(desc(revenue_million)) %>%
  slice_head(n = 1)
## title genre year rating votes duration_min country language type
## 1 Lost Destiny Action 1990 4.55 4747176 128 Japan German TV Show
## budget_million revenue_million popularity_score awards_won critic_score
## 1 180.75 999.7 1.18 3 85.18
## user_score is_sequel is_original platform release_month
## 1 70.83 Yes Yes Disney+ 9
## weekend_collection_million profit_million
## 1 4.28 818.95
Answer: Movie with maximum revenue.
# Ten best-rated titles, reduced to the title and its rating.
data %>%
  select(title, rating) %>%
  arrange(desc(rating)) %>%
  slice_head(n = 10)
## title rating
## 1 Shadow Legends 9.50
## 2 Last Memories 9.49
## 3 Rising Worlds 9.49
## 4 Neon Odyssey 9.48
## 5 Final Whispers 9.48
## 6 Velvet Ashes 9.48
## 7 Fading Legends 9.48
## 8 Golden Kingdom 9.48
## 9 Neon Storm 9.47
## 10 Shadow Destiny 9.46
Answer: Top rated movies.
# Action titles with a rating above 8.
data %>%
  filter(genre == "Action" & rating > 8) %>%
  select(title, rating)
## title rating
## 1 Eternal Legends 8.60
## 2 Burning Truth 8.85
## 3 Eternal Mirage 8.05
## 4 Secret Sky 9.19
## 5 Hidden Destiny 8.68
## 6 Lost Memories 8.21
## 7 Burning Ashes 8.24
## 8 Neon Ashes 8.57
## 9 Shadow Odyssey 8.43
## 10 Shadow Destiny 9.28
## 11 Lost Nights 8.60
## 12 Hidden Empire 8.83
## 13 Broken Ashes 9.01
## 14 Midnight Pursuit 9.19
## 15 Velvet Nights 9.05
## 16 Final Nights 8.23
## 17 Fading Chapter 9.21
## 18 Silent Legends 9.16
## 19 Broken Mirage 9.35
## 20 Shadow Sky 8.51
## 21 Broken Echoes 8.47
## 22 Broken Chapter 9.30
## 23 Hidden Chapter 8.81
## 24 Forgotten Memories 9.32
## 25 Neon Worlds 8.18
## 26 Hidden Frontier 8.79
## 27 Parallel Illusion 9.03
## 28 Crimson Truth 9.20
## 29 Burning Mirage 8.46
## 30 Neon Echoes 9.07
## 31 Lost Pursuit 9.42
## 32 Lost Horizon 9.17
## 33 Final Destiny 8.71
## 34 Lost Illusion 8.84
## 35 Parallel Frontier 9.40
## 36 Silent Nights 8.31
## 37 Dark Odyssey 9.31
## 38 Lost Odyssey 8.77
## 39 Silent Destiny 9.34
## 40 Fading Odyssey 8.80
## 41 Midnight Horizon 8.94
## 42 Forgotten Worlds 9.07
## 43 Last Memories 8.08
## 44 Secret Storm 8.20
## 45 Broken Empire 9.29
## 46 Dark Pursuit 9.03
## 47 Parallel Truth 8.95
## 48 Fading Storm 8.07
## 49 Broken Horizon 9.01
## 50 Dark Storm 8.84
## 51 Burning Worlds 8.44
Answer: High-performing action movies.
# Revenue-to-budget ratio per title, stored as `roi`.
# NOTE(review): this is a gross revenue multiple; ROI is often defined as
# (revenue - budget) / budget -- confirm which definition is intended.
data <- mutate(data, roi = revenue_million / budget_million)

# Ten highest ratios among titles whose budget_million exceeds 100.
data %>%
  filter(budget_million > 100) %>%
  select(title, roi) %>%
  arrange(desc(roi)) %>%
  slice_head(n = 10)
## title roi
## 1 Crimson Odyssey 9.355942
## 2 Fading Frontier 9.222539
## 3 Velvet Memories 9.018171
## 4 Neon Truth 8.924492
## 5 Forgotten Empire 8.791148
## 6 Crimson Ashes 8.700697
## 7 Fading Destiny 8.669961
## 8 Forgotten Mirage 8.580821
## 9 Rising Storm 8.468831
## 10 Crimson Odyssey 8.433574
Answer: ROI shows profitability efficiency.
# Mean profit per genre, restricted to titles rated above 7.5.
data %>%
  filter(rating > 7.5) %>%
  group_by(genre) %>%
  summarise(
    avg_profit = mean(profit_million, na.rm = TRUE)
  )
## # A tibble: 10 × 2
## genre avg_profit
## <chr> <dbl>
## 1 Action 324.
## 2 Animation 345.
## 3 Comedy 362.
## 4 Documentary 308.
## 5 Drama 381.
## 6 Fantasy 296.
## 7 Horror 317.
## 8 Romance 360.
## 9 Sci-Fi 293.
## 10 Thriller 338.
Answer: Shows best performing genres.
# Ten highest-grossing country/platform combinations.
# .groups = "drop" returns an ungrouped tibble, so the result carries no
# leftover `country` grouping and dplyr emits no regrouping message.
data %>%
  group_by(country, platform) %>%
  summarise(
    total_revenue = sum(revenue_million, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_revenue)) %>%
  head(10)
## # A tibble: 10 × 3
## country platform total_revenue
## <chr> <chr> <dbl>
## 1 UK Hulu 32373.
## 2 USA Hulu 32338.
## 3 Japan Disney+ 32067.
## 4 Germany Disney+ 30951.
## 5 South Korea Disney+ 29692.
## 6 USA Netflix 29254.
## 7 Spain Netflix 29161.
## 8 Canada Netflix 27744.
## 9 Japan Hulu 27328.
## 10 France Netflix 27151.
Answer: Top revenue contributors.
# Titles rated above 8 whose revenue_million is below 100.
data %>%
  filter(rating > 8 & revenue_million < 100) %>%
  select(title, rating, revenue_million)
## title rating revenue_million
## 1 Hidden Storm 9.13 39.77
## 2 Broken Legends 8.73 35.83
## 3 Golden Worlds 9.00 31.03
## 4 Hidden Destiny 8.68 27.03
## 5 Lost Worlds 8.20 17.06
## 6 Fading Frontier 9.25 97.57
## 7 Lost Whispers 8.55 65.75
## 8 Burning Chapter 9.27 39.62
## 9 Velvet Kingdom 8.42 65.64
## 10 Shadow Odyssey 8.01 73.52
## 11 Fading Chapter 9.21 89.55
## 12 Hidden Odyssey 9.14 10.53
## 13 Forgotten Echoes 9.02 90.08
## 14 Neon Storm 9.21 61.59
## 15 Parallel Empire 8.58 30.55
## 16 Secret Illusion 9.42 46.92
## 17 Fading Empire 8.81 62.26
## 18 Burning Odyssey 8.95 44.04
## 19 Hidden Frontier 8.79 53.30
## 20 Dark Empire 9.44 29.58
## 21 Broken Chapter 8.47 98.92
## 22 Fading Worlds 8.23 32.13
## 23 Velvet Nights 8.12 78.67
## 24 Hidden Whispers 8.85 81.86
## 25 Eternal Destiny 8.97 25.04
## 26 Forgotten Empire 8.82 1.86
## 27 Dark Truth 8.58 42.89
## 28 Dark Echoes 8.91 41.02
## 29 Parallel Legends 8.41 60.38
## 30 Secret Mirage 8.69 20.58
## 31 Silent Kingdom 8.25 24.88
## 32 Forgotten Whispers 8.01 15.34
## 33 Dark Storm 8.30 61.91
## 34 Golden Ashes 8.16 82.74
## 35 Secret Memories 8.25 86.16
## 36 Final Echoes 9.22 60.21
## 37 Crimson Sky 8.74 12.51
## 38 Forgotten Kingdom 8.98 11.42
## 39 Rising Mirage 8.41 84.39
## 40 Burning Pursuit 9.13 29.71
## 41 Eternal Truth 9.07 85.67
## 42 Parallel Sky 9.30 33.01
## 43 Fading Odyssey 8.24 52.11
## 44 Crimson Pursuit 8.56 61.37
## 45 Dark Horizon 8.11 9.99
## 46 Forgotten Chapter 8.97 94.73
## 47 Broken Echoes 8.42 45.68
## 48 Last Truth 8.69 66.94
## 49 Broken Worlds 8.34 27.04
## 50 Shadow Legends 8.09 36.95
## 51 Silent Nights 9.12 29.74
## 52 Last Nights 8.72 69.55
## 53 Rising Frontier 8.96 59.54
## 54 Velvet Horizon 8.21 82.43
## 55 Fading Echoes 9.16 29.82
## 56 Parallel Odyssey 8.83 67.42
## 57 Eternal Truth 8.56 32.00
## 58 Secret Whispers 9.05 34.35
## 59 Forgotten Whispers 8.56 85.94
Answer: High rating but low revenue movies.
# Per-year summary: mean rating and summed revenue, one row per year.
yearly_data <- data %>%
  group_by(year) %>%
  summarise(
    avg_rating = mean(rating, na.rm = TRUE),
    total_revenue = sum(revenue_million, na.rm = TRUE)
  )

# Print the summary table.
yearly_data
## # A tibble: 34 × 3
## year avg_rating total_revenue
## <dbl> <dbl> <dbl>
## 1 1990 6.33 22517.
## 2 1991 6.86 30901.
## 3 1992 6.50 29930.
## 4 1993 6.28 27038.
## 5 1994 6.57 32108.
## 6 1995 6.70 23779.
## 7 1996 6.20 30031.
## 8 1997 6.85 24714.
## 9 1998 6.51 33133.
## 10 1999 6.92 28731.
## # ℹ 24 more rows
Answer: Yearly trends.
# Platform comparison: mean profit, mean rating and number of titles.
data %>%
  group_by(platform) %>%
  summarise(
    avg_profit = mean(profit_million, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE),
    total_movies = n()
  )
## # A tibble: 4 × 4
## platform avg_profit avg_rating total_movies
## <chr> <dbl> <dbl> <int>
## 1 Amazon Prime 312. 6.68 475
## 2 Disney+ 349. 6.55 519
## 3 Hulu 332. 6.72 514
## 4 Netflix 347. 6.72 492
Answer: Platform comparison.
# Mean popularity score and mean rating for each language.
data %>%
  group_by(language) %>%
  summarise(
    avg_popularity = mean(popularity_score, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE)
  )
## # A tibble: 8 × 3
## language avg_popularity avg_rating
## <chr> <dbl> <dbl>
## 1 Chinese 50.0 6.73
## 2 English 53.1 6.68
## 3 French 51.4 6.61
## 4 German 52.1 6.67
## 5 Hindi 51.5 6.56
## 6 Japanese 48.2 6.82
## 7 Korean 49.3 6.65
## 8 Spanish 50.2 6.61
Answer: Language-wise engagement.
# Per-genre rating level (mean), rating spread (standard deviation)
# and mean profit.
data %>%
  group_by(genre) %>%
  summarise(
    avg_rating = mean(rating, na.rm = TRUE),
    rating_sd = sd(rating, na.rm = TRUE),
    avg_profit = mean(profit_million, na.rm = TRUE)
  )
## # A tibble: 10 × 4
## genre avg_rating rating_sd avg_profit
## <chr> <dbl> <dbl> <dbl>
## 1 Action 6.57 1.66 314.
## 2 Animation 6.59 1.58 327.
## 3 Comedy 6.80 1.55 343.
## 4 Documentary 6.88 1.51 308.
## 5 Drama 6.44 1.54 352.
## 6 Fantasy 6.63 1.55 336.
## 7 Horror 6.74 1.59 347.
## 8 Romance 6.62 1.58 340.
## 9 Sci-Fi 6.65 1.54 345.
## 10 Thriller 6.74 1.57 338.
Answer: Stability using SD.
# Budget plotted against revenue, with a linear-model trend line on top.
ggplot(data, aes(x = budget_million, y = revenue_million)) +
  geom_point(colour = "darkgreen", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  labs(x = "Budget", y = "Revenue", title = "Budget vs Revenue Relationship")
## `geom_smooth()` using formula = 'y ~ x'
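Answer: The fitted line is almost flat — budget and revenue show essentially no linear relationship in this dataset (r ≈ 0.015, p ≈ 0.49).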
# Total revenue per genre, drawn as bars ordered from highest to lowest.
# na.rm = TRUE matches the other revenue totals in this analysis, so a single
# missing revenue value cannot turn a genre's total into NA.
genre_revenue <- data %>%
  group_by(genre) %>%
  summarise(
    total_revenue = sum(revenue_million, na.rm = TRUE),
    .groups = "drop"
  )

# geom_col() draws the pre-computed totals directly; it is the idiomatic
# equivalent of geom_bar(stat = "identity").
ggplot(
  genre_revenue,
  aes(x = reorder(genre, -total_revenue), y = total_revenue, fill = genre)
) +
  geom_col() +
  labs(title = "Total Revenue by Genre", x = "Genre", y = "Total Revenue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Answer: Genre comparison.
# Total revenue per year: a blue line with a red marker at each year.
ggplot(yearly_data, aes(x = year, y = total_revenue)) +
  geom_line(colour = "blue") +
  geom_point(colour = "red") +
  labs(title = "Yearly Revenue Trend", x = "Year", y = "Revenue")
Answer: Trend analysis.
# Summary statistics for rating: mean, median, standard deviation and
# the default quantiles (0%, 25%, 50%, 75%, 100%).
mean(data$rating)
## [1] 6.665745
median(data$rating)
## [1] 6.6
sd(data$rating)
## [1] 1.568482
quantile(data$rating)
## 0% 25% 50% 75% 100%
## 4.000 5.315 6.600 7.960 9.500
Answer: Central tendency + spread.
# IQR rule: a revenue is flagged when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
Q1 <- quantile(data$revenue_million, probs = 0.25)
Q3 <- quantile(data$revenue_million, probs = 0.75)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val

# Count the revenues that fall outside [lower, upper].
sum(!(data$revenue_million >= lower & data$revenue_million <= upper))
## [1] 0
Answer: Detects outliers.
# Standardise ratings (centre and scale), then count values more than
# three standard deviations from the mean.
z <- scale(data$rating)
sum(abs(z) > 3, na.rm = TRUE)
## [1] 0
Answer: Extreme values detection.
# Kernel density estimate of the rating distribution.
ggplot(data, aes(x = rating)) +
  geom_density(alpha = 0.5, fill = "skyblue") +
  labs(x = "Rating", y = "Density", title = "Density Plot of Ratings")
Answer: Distribution shape.
# Correlations between all numeric columns, computed only on rows that have
# no missing values (use = "complete.obs").
numeric_data <- select(data, where(is.numeric))
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Print the full matrix.
cor_matrix
## year rating votes
## year 1.0000000000 0.008922438 0.003213675
## rating 0.0089224379 1.000000000 -0.018007572
## votes 0.0032136751 -0.018007572 1.000000000
## duration_min -0.0428898086 -0.012326106 0.003293255
## budget_million 0.0041827498 -0.021969857 0.013043434
## revenue_million 0.0016225631 -0.013533906 0.014207714
## popularity_score -0.0129252097 -0.018545661 0.022628176
## awards_won 0.0208571400 -0.047914174 -0.012863168
## critic_score -0.0054956357 0.007692713 -0.021702091
## user_score 0.0060326496 0.033200573 -0.024237762
## release_month 0.0297635860 -0.007988467 0.009563739
## weekend_collection_million -0.0270523152 -0.023227893 -0.004183373
## profit_million 0.0003859642 -0.006857652 0.010021066
## roi 0.0087078458 0.004399491 -0.023535097
## duration_min budget_million revenue_million
## year -0.0428898086 0.00418275 0.0016225631
## rating -0.0123261058 -0.02196986 -0.0135339061
## votes 0.0032932554 0.01304343 0.0142077141
## duration_min 1.0000000000 0.06067780 0.0342986944
## budget_million 0.0606777952 1.00000000 0.0154235583
## revenue_million 0.0342986944 0.01542356 1.0000000000
## popularity_score -0.0029036704 -0.03336210 -0.0560458608
## awards_won -0.0003804844 -0.01780206 0.0024063133
## critic_score 0.0049251697 -0.02567229 0.0253768063
## user_score 0.0116550324 -0.01955360 0.0205360943
## release_month -0.0139357431 -0.05142280 -0.0229659124
## weekend_collection_million 0.0265800853 -0.01057077 -0.0002487916
## profit_million 0.0159710427 -0.26676331 0.9595329946
## roi -0.0035750553 -0.31366016 0.1861823632
## popularity_score awards_won critic_score
## year -0.01292521 0.0208571400 -0.0054956357
## rating -0.01854566 -0.0479141739 0.0076927131
## votes 0.02262818 -0.0128631678 -0.0217020910
## duration_min -0.00290367 -0.0003804844 0.0049251697
## budget_million -0.03336210 -0.0178020600 -0.0256722940
## revenue_million -0.05604586 0.0024063133 0.0253768063
## popularity_score 1.00000000 0.0203571603 -0.0497163761
## awards_won 0.02035716 1.0000000000 0.0356473586
## critic_score -0.04971638 0.0356473586 1.0000000000
## user_score -0.01079356 0.0142911594 0.0007226204
## release_month -0.02471972 -0.0170061241 0.0703392690
## weekend_collection_million 0.01510772 0.0116372824 0.0147078267
## profit_million -0.04462554 0.0073329786 0.0316901940
## roi -0.01852617 0.0152531612 -0.0103211339
## user_score release_month
## year 0.0060326496 0.029763586
## rating 0.0332005730 -0.007988467
## votes -0.0242377618 0.009563739
## duration_min 0.0116550324 -0.013935743
## budget_million -0.0195535991 -0.051422798
## revenue_million 0.0205360943 -0.022965912
## popularity_score -0.0107935637 -0.024719719
## awards_won 0.0142911594 -0.017006124
## critic_score 0.0007226204 0.070339269
## user_score 1.0000000000 -0.001271712
## release_month -0.0012717116 1.000000000
## weekend_collection_million 0.0020579331 -0.004359497
## profit_million 0.0253011380 -0.007654120
## roi 0.0159507822 0.002817174
## weekend_collection_million profit_million
## year -0.0270523152 0.0003859642
## rating -0.0232278927 -0.0068576524
## votes -0.0041833728 0.0100210664
## duration_min 0.0265800853 0.0159710427
## budget_million -0.0105707702 -0.2667633129
## revenue_million -0.0002487916 0.9595329946
## popularity_score 0.0151077187 -0.0446255407
## awards_won 0.0116372824 0.0073329786
## critic_score 0.0147078267 0.0316901940
## user_score 0.0020579331 0.0253011380
## release_month -0.0043594971 -0.0076541201
## weekend_collection_million 1.0000000000 0.0027372387
## profit_million 0.0027372387 1.0000000000
## roi 0.0096396280 0.2677928694
## roi
## year 0.008707846
## rating 0.004399491
## votes -0.023535097
## duration_min -0.003575055
## budget_million -0.313660156
## revenue_million 0.186182363
## popularity_score -0.018526166
## awards_won 0.015253161
## critic_score -0.010321134
## user_score 0.015950782
## release_month 0.002817174
## weekend_collection_million 0.009639628
## profit_million 0.267792869
## roi 1.000000000
Answer: Variable relationships.
# Budget-revenue correlation computed separately for each platform.
data %>%
  group_by(platform) %>%
  summarise(
    corr = cor(budget_million, revenue_million, use = "complete.obs")
  )
## # A tibble: 4 × 2
## platform corr
## <chr> <dbl>
## 1 Amazon Prime 0.00422
## 2 Disney+ -0.0189
## 3 Hulu 0.0146
## 4 Netflix 0.0646
Answer: Platform-wise correlation.
# rcorr() (Hmisc) returns the correlation coefficients in $r and the
# matching p-values in $P; show the coefficients first.
res <- Hmisc::rcorr(as.matrix(numeric_data))
res$r
## year rating votes
## year 1.0000000000 0.008922438 0.003213675
## rating 0.0089224379 1.000000000 -0.018007572
## votes 0.0032136751 -0.018007572 1.000000000
## duration_min -0.0428898086 -0.012326106 0.003293255
## budget_million 0.0041827498 -0.021969857 0.013043434
## revenue_million 0.0016225631 -0.013533906 0.014207714
## popularity_score -0.0129252097 -0.018545661 0.022628176
## awards_won 0.0208571400 -0.047914174 -0.012863168
## critic_score -0.0054956357 0.007692713 -0.021702091
## user_score 0.0060326496 0.033200573 -0.024237762
## release_month 0.0297635860 -0.007988467 0.009563739
## weekend_collection_million -0.0270523152 -0.023227893 -0.004183373
## profit_million 0.0003859642 -0.006857652 0.010021066
## roi 0.0087078458 0.004399491 -0.023535097
## duration_min budget_million revenue_million
## year -0.0428898086 0.00418275 0.0016225631
## rating -0.0123261058 -0.02196986 -0.0135339061
## votes 0.0032932554 0.01304343 0.0142077141
## duration_min 1.0000000000 0.06067780 0.0342986944
## budget_million 0.0606777952 1.00000000 0.0154235583
## revenue_million 0.0342986944 0.01542356 1.0000000000
## popularity_score -0.0029036704 -0.03336210 -0.0560458608
## awards_won -0.0003804844 -0.01780206 0.0024063133
## critic_score 0.0049251697 -0.02567229 0.0253768063
## user_score 0.0116550324 -0.01955360 0.0205360943
## release_month -0.0139357431 -0.05142280 -0.0229659124
## weekend_collection_million 0.0265800853 -0.01057077 -0.0002487916
## profit_million 0.0159710427 -0.26676331 0.9595329946
## roi -0.0035750553 -0.31366016 0.1861823632
## popularity_score awards_won critic_score
## year -0.01292521 0.0208571400 -0.0054956357
## rating -0.01854566 -0.0479141739 0.0076927131
## votes 0.02262818 -0.0128631678 -0.0217020910
## duration_min -0.00290367 -0.0003804844 0.0049251697
## budget_million -0.03336210 -0.0178020600 -0.0256722940
## revenue_million -0.05604586 0.0024063133 0.0253768063
## popularity_score 1.00000000 0.0203571603 -0.0497163761
## awards_won 0.02035716 1.0000000000 0.0356473586
## critic_score -0.04971638 0.0356473586 1.0000000000
## user_score -0.01079356 0.0142911594 0.0007226204
## release_month -0.02471972 -0.0170061241 0.0703392690
## weekend_collection_million 0.01510772 0.0116372824 0.0147078267
## profit_million -0.04462554 0.0073329786 0.0316901940
## roi -0.01852617 0.0152531612 -0.0103211339
## user_score release_month
## year 0.0060326496 0.029763586
## rating 0.0332005730 -0.007988467
## votes -0.0242377618 0.009563739
## duration_min 0.0116550324 -0.013935743
## budget_million -0.0195535991 -0.051422798
## revenue_million 0.0205360943 -0.022965912
## popularity_score -0.0107935637 -0.024719719
## awards_won 0.0142911594 -0.017006124
## critic_score 0.0007226204 0.070339269
## user_score 1.0000000000 -0.001271712
## release_month -0.0012717116 1.000000000
## weekend_collection_million 0.0020579331 -0.004359497
## profit_million 0.0253011380 -0.007654120
## roi 0.0159507822 0.002817174
## weekend_collection_million profit_million
## year -0.0270523152 0.0003859642
## rating -0.0232278927 -0.0068576524
## votes -0.0041833728 0.0100210664
## duration_min 0.0265800853 0.0159710427
## budget_million -0.0105707702 -0.2667633129
## revenue_million -0.0002487916 0.9595329946
## popularity_score 0.0151077187 -0.0446255407
## awards_won 0.0116372824 0.0073329786
## critic_score 0.0147078267 0.0316901940
## user_score 0.0020579331 0.0253011380
## release_month -0.0043594971 -0.0076541201
## weekend_collection_million 1.0000000000 0.0027372387
## profit_million 0.0027372387 1.0000000000
## roi 0.0096396280 0.2677928694
## roi
## year 0.008707846
## rating 0.004399491
## votes -0.023535097
## duration_min -0.003575055
## budget_million -0.313660156
## revenue_million 0.186182363
## popularity_score -0.018526166
## awards_won 0.015253161
## critic_score -0.010321134
## user_score 0.015950782
## release_month 0.002817174
## weekend_collection_million 0.009639628
## profit_million 0.267792869
## roi 1.000000000
res$P
## year rating votes duration_min
## year NA 0.69005382 0.8857923 0.055139691
## rating 0.69005382 NA 0.4208861 0.581690909
## votes 0.88579230 0.42088612 NA 0.882984277
## duration_min 0.05513969 0.58169091 0.8829843 NA
## budget_million 0.85170601 0.32608697 0.5599066 0.006639629
## revenue_million 0.94218975 0.54524279 0.5254133 0.125183132
## popularity_score 0.56346958 0.40713839 0.3117965 0.896744303
## awards_won 0.35119292 0.03213896 0.5653437 0.986432514
## critic_score 0.80597602 0.73098271 0.3320193 0.825774647
## user_score 0.78745101 0.13774110 0.2786189 0.602421997
## release_month 0.18334350 0.72106378 0.6690547 0.533372585
## weekend_collection_million 0.22655481 0.29914270 0.8516842 0.234768681
## profit_million 0.98623713 0.75922747 0.6542349 0.475322574
## roi 0.69713514 0.84411776 0.2927953 0.873052890
## budget_million revenue_million popularity_score
## year 0.851706007 0.94218975 0.56346958
## rating 0.326086967 0.54524279 0.40713839
## votes 0.559906630 0.52541334 0.31179649
## duration_min 0.006639629 0.12518313 0.89674430
## budget_million NA 0.49058903 0.13583524
## revenue_million 0.490589026 NA 0.01218122
## popularity_score 0.135835240 0.01218122 NA
## awards_won 0.426207792 0.91435545 0.36286206
## critic_score 0.251145605 0.25664422 0.02619094
## user_score 0.382118040 0.35865830 0.62951108
## release_month 0.021460614 0.30462764 0.26916950
## weekend_collection_million 0.636600721 0.99112823 0.49951449
## profit_million 0.000000000 0.00000000 0.04599260
## roi 0.000000000 0.00000000 0.40763175
## awards_won critic_score user_score release_month
## year 0.35119292 0.805976015 0.7874510 0.183343501
## rating 0.03213896 0.730982715 0.1377411 0.721063775
## votes 0.56534371 0.332019333 0.2786189 0.669054672
## duration_min 0.98643251 0.825774647 0.6024220 0.533372585
## budget_million 0.42620779 0.251145605 0.3821180 0.021460614
## revenue_million 0.91435545 0.256644225 0.3586583 0.304627644
## popularity_score 0.36286206 0.026190938 0.6295111 0.269169501
## awards_won NA 0.111001410 0.5229835 0.447184961
## critic_score 0.11100141 NA 0.9742357 0.001646167
## user_score 0.52298354 0.974235706 NA 0.954674930
## release_month 0.44718496 0.001646167 0.9546749 NA
## weekend_collection_million 0.60297483 0.510937670 0.9267170 0.845516893
## profit_million 0.74310814 0.156571141 0.2580656 0.732280402
## roi 0.49539369 0.644584203 0.4758829 0.899803700
## weekend_collection_million profit_million roi
## year 0.2265548 0.9862371 0.6971351
## rating 0.2991427 0.7592275 0.8441178
## votes 0.8516842 0.6542349 0.2927953
## duration_min 0.2347687 0.4753226 0.8730529
## budget_million 0.6366007 0.0000000 0.0000000
## revenue_million 0.9911282 0.0000000 0.0000000
## popularity_score 0.4995145 0.0459926 0.4076318
## awards_won 0.6029748 0.7431081 0.4953937
## critic_score 0.5109377 0.1565711 0.6445842
## user_score 0.9267170 0.2580656 0.4758829
## release_month 0.8455169 0.7322804 0.8998037
## weekend_collection_million NA 0.9026323 0.6665863
## profit_million 0.9026323 NA 0.0000000
## roi 0.6665863 0.0000000 NA
Answer: p-values for correlation.
# Correlation heatmap for a hand-picked subset of numeric columns.
num <- data[, c(
  "rating", "votes", "budget_million",
  "revenue_million", "profit_million", "popularity_score"
)]
corr <- cor(num, use = "complete.obs")

# as.table() + as.data.frame() reshape the matrix to long form: one row per
# variable pair (Var1, Var2) with the coefficient in Freq.
corr_df <- as.data.frame(as.table(corr))

ggplot(corr_df, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  labs(title = "Correlation Heatmap", x = "", y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Answer: Visual correlation.
# Linear model of revenue on six numeric predictors, followed by its
# coefficient table and fit statistics.
model_better <- lm(
  revenue_million ~ budget_million + rating + popularity_score +
    votes + duration_min + awards_won,
  data = data
)
summary(model_better)
##
## Call:
## lm(formula = revenue_million ~ budget_million + rating + popularity_score +
## votes + duration_min + awards_won, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517.69 -258.77 -4.04 250.00 544.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.830e+02 4.317e+01 11.187 <2e-16 ***
## budget_million 3.798e-02 7.670e-02 0.495 0.621
## rating -2.496e+00 4.141e+00 -0.603 0.547
## popularity_score -5.693e-01 2.265e-01 -2.514 0.012 *
## votes 3.021e-06 4.495e-06 0.672 0.502
## duration_min 2.791e-01 1.879e-01 1.485 0.138
## awards_won 1.672e-01 1.131e+00 0.148 0.883
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 289.9 on 1993 degrees of freedom
## Multiple R-squared: 0.004867, Adjusted R-squared: 0.001872
## F-statistic: 1.625 on 6 and 1993 DF, p-value: 0.1363
# Pull the headline fit statistics out of the model summary.
# The summary is computed once and reused for all three values.
model_better_summary <- summary(model_better)
r2 <- model_better_summary$r.squared
adj_r2 <- model_better_summary$adj.r.squared
f_val <- model_better_summary$fstatistic
r2
## [1] 0.004867397
adj_r2
## [1] 0.001871514
f_val
## value numdf dendf
## 1.624695 6.000000 1993.000000
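Answer: R² shows how much of the variance in revenue is explained by the predictors. Adjusted R² penalizes extra variables, so it is the fairer measure when comparing models of different size. The F-statistic tests overall model significance. Here R² ≈ 0.005 and the F-test p-value is 0.14, so the model explains almost none of the variance in revenue.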
# Smaller linear model: revenue on budget, rating and popularity only.
model_multi <- lm(
  revenue_million ~ budget_million + rating + popularity_score,
  data = data
)
summary(model_multi)
##
## Call:
## lm(formula = revenue_million ~ budget_million + rating + popularity_score,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -507.13 -257.28 -2.14 249.36 541.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 525.07153 33.19178 15.819 <2e-16 ***
## budget_million 0.04533 0.07653 0.592 0.5537
## rating -2.64153 4.13519 -0.639 0.5230
## popularity_score -0.56554 0.22634 -2.499 0.0125 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 289.9 on 1996 degrees of freedom
## Multiple R-squared: 0.003529, Adjusted R-squared: 0.002031
## F-statistic: 2.356 on 3 and 1996 DF, p-value: 0.07008
Answer: Multi-variable model.
# Quadratic budget model: add a squared-budget column to `data`, then
# regress revenue on both the linear and the squared term.
data <- mutate(data, budget_sq = budget_million^2)
model_poly <- lm(
  revenue_million ~ budget_million + budget_sq,
  data = data
)
summary(model_poly)
##
## Call:
## lm(formula = revenue_million ~ budget_million + budget_sq, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -497.52 -257.83 -0.38 251.13 518.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.871e+02 2.048e+01 23.789 <2e-16 ***
## budget_million -1.282e-01 3.079e-01 -0.416 0.677
## budget_sq 5.951e-04 9.805e-04 0.607 0.544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 290.2 on 1997 degrees of freedom
## Multiple R-squared: 0.0004223, Adjusted R-squared: -0.0005788
## F-statistic: 0.4218 on 2 and 1997 DF, p-value: 0.6559
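Answer: Tests for a non-linear (quadratic) effect of budget on revenue. Neither budget term is significant (overall p ≈ 0.66), so there is no evidence of a non-linear relationship.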
# Diagnostic plots for model_multi, arranged in a 2 x 2 grid.
# par() returns the previous settings; restore them afterwards so later
# plots are not forced into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model_multi)
par(old_par)
Answer: Model validation.
# Two hypothetical titles described by the three predictors model_multi uses.
new_data <- data.frame(
  budget_million   = c(50, 200),
  rating           = c(7, 8),
  popularity_score = c(60, 90)
)

# Predicted revenue for each row of new_data.
predict(model_multi, newdata = new_data)
## 1 2
## 474.9154 462.1080
Answer:
Model predicts revenue for new movies based on input values
# Reproducible 70/30 train/test split of the rows of `data`.
set.seed(123)
n_rows <- nrow(data)
# seq_len() stays correct for zero rows (1:0 would yield c(1, 0));
# floor() makes the training-set size an explicit whole number.
idx <- sample(seq_len(n_rows), size = floor(0.7 * n_rows))
train <- data[idx, ]
test <- data[-idx, ]
Answer: Data splitting.
# 5-fold cross-validated linear regression of revenue on budget, rating and
# popularity, fitted through caret.
# NOTE(review): the model is resampled on the full `data`; the `train`/`test`
# split created earlier is not used here -- confirm whether that is intended.
library(caret)
train_control <- trainControl(method = "cv", number = 5)
caret_model <- train(
  revenue_million ~ budget_million + rating + popularity_score,
  data      = data,
  method    = "lm",
  trControl = train_control
)

# Print the resampling summary (RMSE, R-squared, MAE).
caret_model
## Linear Regression
##
## 2000 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1600, 1600, 1600, 1600, 1600
## Resampling results:
##
## RMSE Rsquared MAE
## 289.9708 0.001997352 251.5352
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Answer: Cross-validation improves model reliability by testing on multiple data splits.