---
title: "Movie Recommendation Analysis"
author: "Aashish Kushwaha"
date: "2026-04-06"
output:
  html_document:
    toc: true
    toc_float:
      collapsed: false
    number_sections: true
    theme: cosmo
---

0.1 Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(tidyr)
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(caret)
## Loading required package: lattice

0.2 Load Dataset

# Read netflix_dataset_clean_titles.csv (path is relative to the working
# directory) into `data`; readr prints the guessed column types as a message.
data <- read_csv("netflix_dataset_clean_titles.csv")
## Rows: 2000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): title, genre, country, language, type, is_sequel, is_original, pla...
## dbl (13): year, rating, votes, duration_min, budget_million, revenue_million...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the tibble returned by read_csv() into a base data.frame.
data <- as.data.frame(data)

# Derive profit as revenue minus budget (both columns are in millions).
data <- mutate(data, profit_million = revenue_million - budget_million)

1 Level 1: Data Understanding

1.0.1 Q1: Structure of dataset

# Print every column's type together with its first few values.
str(data)
## 'data.frame':    2000 obs. of  21 variables:
##  $ title                     : chr  "Golden Odyssey" "Velvet Nights" "Fading Memories" "Secret Nights" ...
##  $ genre                     : chr  "Action" "Drama" "Drama" "Animation" ...
##  $ year                      : num  2015 2022 2000 1999 2014 ...
##  $ rating                    : num  5.58 7.07 5.67 4.48 4.01 6.47 8.6 7.15 9.5 5.58 ...
##  $ votes                     : num  1432041 273667 3454402 4733361 2030122 ...
##  $ duration_min              : num  99 106 100 106 162 79 136 168 124 175 ...
##  $ country                   : chr  "Australia" "Canada" "UK" "Spain" ...
##  $ language                  : chr  "French" "Korean" "Spanish" "French" ...
##  $ type                      : chr  "Movie" "Movie" "TV Show" "Movie" ...
##  $ budget_million            : num  292.01 91.46 208.36 3.66 57.4 ...
##  $ revenue_million           : num  641 381 185 760 208 ...
##  $ popularity_score          : num  94.9 77.6 96.3 26.4 83.7 ...
##  $ awards_won                : num  7 8 9 10 0 14 7 18 11 5 ...
##  $ critic_score              : num  44.6 51.8 38.1 62 98.9 ...
##  $ user_score                : num  99.3 59.5 36.8 38.4 70.9 ...
##  $ is_sequel                 : chr  "No" "No" "No" "No" ...
##  $ is_original               : chr  "No" "Yes" "Yes" "No" ...
##  $ platform                  : chr  "Netflix" "Netflix" "Hulu" "Disney+" ...
##  $ release_month             : num  11 11 6 10 12 6 12 3 6 10 ...
##  $ weekend_collection_million: num  4.78 7.6 44.96 37.78 18.59 ...
##  $ profit_million            : num  349.4 289.5 -22.9 756.6 150.2 ...
# Number of rows followed by number of columns.
c(nrow(data), ncol(data))
## [1] 2000   21

Answer: Shows structure and dimensions of dataset.


1.0.2 Q2: Highest revenue movie

# Sort by revenue, largest first, and keep only the top row.
head(arrange(data, desc(revenue_million)), n = 1)
##          title  genre year rating   votes duration_min country language    type
## 1 Lost Destiny Action 1990   4.55 4747176          128   Japan   German TV Show
##   budget_million revenue_million popularity_score awards_won critic_score
## 1         180.75           999.7             1.18          3        85.18
##   user_score is_sequel is_original platform release_month
## 1      70.83       Yes         Yes  Disney+             9
##   weekend_collection_million profit_million
## 1                       4.28         818.95

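Answer: "Lost Destiny" (1990, Action) has the highest revenue in the dataset at 999.7 million. Note that it is listed as a TV Show, because the query does not filter on type.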


1.0.3 Q3: Top 10 highest rated movies

# Keep only title and rating, then list the ten highest ratings.
data %>%
  select(title, rating) %>%
  arrange(desc(rating)) %>%
  head(n = 10)
##             title rating
## 1  Shadow Legends   9.50
## 2   Last Memories   9.49
## 3   Rising Worlds   9.49
## 4    Neon Odyssey   9.48
## 5  Final Whispers   9.48
## 6    Velvet Ashes   9.48
## 7  Fading Legends   9.48
## 8  Golden Kingdom   9.48
## 9      Neon Storm   9.47
## 10 Shadow Destiny   9.46

Answer: Top rated movies.


1.0.4 Q4: Action movies with rating > 8

# Titles in the Action genre whose rating exceeds 8.
data %>%
  filter(genre == "Action" & rating > 8) %>%
  select(title, rating)
##                 title rating
## 1     Eternal Legends   8.60
## 2       Burning Truth   8.85
## 3      Eternal Mirage   8.05
## 4          Secret Sky   9.19
## 5      Hidden Destiny   8.68
## 6       Lost Memories   8.21
## 7       Burning Ashes   8.24
## 8          Neon Ashes   8.57
## 9      Shadow Odyssey   8.43
## 10     Shadow Destiny   9.28
## 11        Lost Nights   8.60
## 12      Hidden Empire   8.83
## 13       Broken Ashes   9.01
## 14   Midnight Pursuit   9.19
## 15      Velvet Nights   9.05
## 16       Final Nights   8.23
## 17     Fading Chapter   9.21
## 18     Silent Legends   9.16
## 19      Broken Mirage   9.35
## 20         Shadow Sky   8.51
## 21      Broken Echoes   8.47
## 22     Broken Chapter   9.30
## 23     Hidden Chapter   8.81
## 24 Forgotten Memories   9.32
## 25        Neon Worlds   8.18
## 26    Hidden Frontier   8.79
## 27  Parallel Illusion   9.03
## 28      Crimson Truth   9.20
## 29     Burning Mirage   8.46
## 30        Neon Echoes   9.07
## 31       Lost Pursuit   9.42
## 32       Lost Horizon   9.17
## 33      Final Destiny   8.71
## 34      Lost Illusion   8.84
## 35  Parallel Frontier   9.40
## 36      Silent Nights   8.31
## 37       Dark Odyssey   9.31
## 38       Lost Odyssey   8.77
## 39     Silent Destiny   9.34
## 40     Fading Odyssey   8.80
## 41   Midnight Horizon   8.94
## 42   Forgotten Worlds   9.07
## 43      Last Memories   8.08
## 44       Secret Storm   8.20
## 45      Broken Empire   9.29
## 46       Dark Pursuit   9.03
## 47     Parallel Truth   8.95
## 48       Fading Storm   8.07
## 49     Broken Horizon   9.01
## 50         Dark Storm   8.84
## 51     Burning Worlds   8.44

Answer: High-performing action movies.


2 Level 2: Advanced Analysis

2.0.1 Q5: ROI analysis

# Add `roi` as the ratio of revenue to budget.
# NOTE(review): this is a revenue multiple; a conventional ROI would be
# profit / budget (this value minus 1) -- confirm which one is intended.
data <- mutate(data, roi = revenue_million / budget_million)

# Among titles with a budget above 100 million, show the ten largest ratios.
data %>%
  filter(budget_million > 100) %>%
  select(title, roi) %>%
  arrange(desc(roi)) %>%
  head(n = 10)
##               title      roi
## 1   Crimson Odyssey 9.355942
## 2   Fading Frontier 9.222539
## 3   Velvet Memories 9.018171
## 4        Neon Truth 8.924492
## 5  Forgotten Empire 8.791148
## 6     Crimson Ashes 8.700697
## 7    Fading Destiny 8.669961
## 8  Forgotten Mirage 8.580821
## 9      Rising Storm 8.468831
## 10  Crimson Odyssey 8.433574

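Answer: Here roi is the revenue-to-budget ratio, so a value of 9.36 means revenue was about 9.4 times the budget. Among titles with a budget above 100 million, "Crimson Odyssey" has the highest ratio.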


2.0.2 Q6: Genre profitability

# Mean profit per genre, restricted to titles rated above 7.5.
filter(data, rating > 7.5) %>%
  group_by(genre) %>%
  summarise(avg_profit = mean(profit_million, na.rm = TRUE))
## # A tibble: 10 × 2
##    genre       avg_profit
##    <chr>            <dbl>
##  1 Action            324.
##  2 Animation         345.
##  3 Comedy            362.
##  4 Documentary       308.
##  5 Drama             381.
##  6 Fantasy           296.
##  7 Horror            317.
##  8 Romance           360.
##  9 Sci-Fi            293.
## 10 Thriller          338.

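Answer: Among titles rated above 7.5, Drama has the highest average profit (about 381 million) and Sci-Fi the lowest (about 293 million).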


2.0.3 Q7: Country-platform revenue

# Total revenue for every country/platform pair, ten largest first.
# `.groups = "drop"` returns an ungrouped tibble, so the ranking below runs
# over all pairs and dplyr no longer emits its regrouping message.
data %>%
  group_by(country, platform) %>%
  summarise(
    total_revenue = sum(revenue_million, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_revenue)) %>%
  head(10)
## # A tibble: 10 × 3
##    country     platform total_revenue
##    <chr>       <chr>            <dbl>
##  1 UK          Hulu            32373.
##  2 USA         Hulu            32338.
##  3 Japan       Disney+         32067.
##  4 Germany     Disney+         30951.
##  5 South Korea Disney+         29692.
##  6 USA         Netflix         29254.
##  7 Spain       Netflix         29161.
##  8 Canada      Netflix         27744.
##  9 Japan       Hulu            27328.
## 10 France      Netflix         27151.

Answer: Top revenue contributors.


2.0.4 Q8: Underrated movies

# Titles rated above 8 that earned less than 100 million in revenue.
data %>%
  filter(rating > 8 & revenue_million < 100) %>%
  select(title, rating, revenue_million)
##                 title rating revenue_million
## 1        Hidden Storm   9.13           39.77
## 2      Broken Legends   8.73           35.83
## 3       Golden Worlds   9.00           31.03
## 4      Hidden Destiny   8.68           27.03
## 5         Lost Worlds   8.20           17.06
## 6     Fading Frontier   9.25           97.57
## 7       Lost Whispers   8.55           65.75
## 8     Burning Chapter   9.27           39.62
## 9      Velvet Kingdom   8.42           65.64
## 10     Shadow Odyssey   8.01           73.52
## 11     Fading Chapter   9.21           89.55
## 12     Hidden Odyssey   9.14           10.53
## 13   Forgotten Echoes   9.02           90.08
## 14         Neon Storm   9.21           61.59
## 15    Parallel Empire   8.58           30.55
## 16    Secret Illusion   9.42           46.92
## 17      Fading Empire   8.81           62.26
## 18    Burning Odyssey   8.95           44.04
## 19    Hidden Frontier   8.79           53.30
## 20        Dark Empire   9.44           29.58
## 21     Broken Chapter   8.47           98.92
## 22      Fading Worlds   8.23           32.13
## 23      Velvet Nights   8.12           78.67
## 24    Hidden Whispers   8.85           81.86
## 25    Eternal Destiny   8.97           25.04
## 26   Forgotten Empire   8.82            1.86
## 27         Dark Truth   8.58           42.89
## 28        Dark Echoes   8.91           41.02
## 29   Parallel Legends   8.41           60.38
## 30      Secret Mirage   8.69           20.58
## 31     Silent Kingdom   8.25           24.88
## 32 Forgotten Whispers   8.01           15.34
## 33         Dark Storm   8.30           61.91
## 34       Golden Ashes   8.16           82.74
## 35    Secret Memories   8.25           86.16
## 36       Final Echoes   9.22           60.21
## 37        Crimson Sky   8.74           12.51
## 38  Forgotten Kingdom   8.98           11.42
## 39      Rising Mirage   8.41           84.39
## 40    Burning Pursuit   9.13           29.71
## 41      Eternal Truth   9.07           85.67
## 42       Parallel Sky   9.30           33.01
## 43     Fading Odyssey   8.24           52.11
## 44    Crimson Pursuit   8.56           61.37
## 45       Dark Horizon   8.11            9.99
## 46  Forgotten Chapter   8.97           94.73
## 47      Broken Echoes   8.42           45.68
## 48         Last Truth   8.69           66.94
## 49      Broken Worlds   8.34           27.04
## 50     Shadow Legends   8.09           36.95
## 51      Silent Nights   9.12           29.74
## 52        Last Nights   8.72           69.55
## 53    Rising Frontier   8.96           59.54
## 54     Velvet Horizon   8.21           82.43
## 55      Fading Echoes   9.16           29.82
## 56   Parallel Odyssey   8.83           67.42
## 57      Eternal Truth   8.56           32.00
## 58    Secret Whispers   9.05           34.35
## 59 Forgotten Whispers   8.56           85.94

Answer: High rating but low revenue movies.


3 Level 3: Grouping

3.0.1 Q9: Year-wise analysis

# Per-year mean rating and summed revenue, stored as `yearly_data`.
yearly_data <- summarise(
  group_by(data, year),
  avg_rating = mean(rating, na.rm = TRUE),
  total_revenue = sum(revenue_million, na.rm = TRUE)
)

yearly_data
## # A tibble: 34 × 3
##     year avg_rating total_revenue
##    <dbl>      <dbl>         <dbl>
##  1  1990       6.33        22517.
##  2  1991       6.86        30901.
##  3  1992       6.50        29930.
##  4  1993       6.28        27038.
##  5  1994       6.57        32108.
##  6  1995       6.70        23779.
##  7  1996       6.20        30031.
##  8  1997       6.85        24714.
##  9  1998       6.51        33133.
## 10  1999       6.92        28731.
## # ℹ 24 more rows

Answer: Yearly trends.


3.0.2 Q10: Platform performance

# Per-platform mean profit, mean rating and number of titles.
group_by(data, platform) %>%
  summarise(
    avg_profit = mean(profit_million, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE),
    total_movies = n()
  )
## # A tibble: 4 × 4
##   platform     avg_profit avg_rating total_movies
##   <chr>             <dbl>      <dbl>        <int>
## 1 Amazon Prime       312.       6.68          475
## 2 Disney+            349.       6.55          519
## 3 Hulu               332.       6.72          514
## 4 Netflix            347.       6.72          492

Answer: Platform comparison.


3.0.3 Q11: Language performance

# Per-language mean popularity score and mean rating.
group_by(data, language) %>%
  summarise(
    avg_popularity = mean(popularity_score, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE)
  )
## # A tibble: 8 × 3
##   language avg_popularity avg_rating
##   <chr>             <dbl>      <dbl>
## 1 Chinese            50.0       6.73
## 2 English            53.1       6.68
## 3 French             51.4       6.61
## 4 German             52.1       6.67
## 5 Hindi              51.5       6.56
## 6 Japanese           48.2       6.82
## 7 Korean             49.3       6.65
## 8 Spanish            50.2       6.61

Answer: Language-wise engagement.


3.0.4 Q12: Genre stability

# Per-genre mean rating, rating standard deviation and mean profit.
group_by(data, genre) %>%
  summarise(
    avg_rating = mean(rating, na.rm = TRUE),
    rating_sd = sd(rating, na.rm = TRUE),
    avg_profit = mean(profit_million, na.rm = TRUE)
  )
## # A tibble: 10 × 4
##    genre       avg_rating rating_sd avg_profit
##    <chr>            <dbl>     <dbl>      <dbl>
##  1 Action            6.57      1.66       314.
##  2 Animation         6.59      1.58       327.
##  3 Comedy            6.80      1.55       343.
##  4 Documentary       6.88      1.51       308.
##  5 Drama             6.44      1.54       352.
##  6 Fantasy           6.63      1.55       336.
##  7 Horror            6.74      1.59       347.
##  8 Romance           6.62      1.58       340.
##  9 Sci-Fi            6.65      1.54       345.
## 10 Thriller          6.74      1.57       338.

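Answer: The standard deviation of rating measures stability. It is similar across genres (1.51 to 1.66): Documentary ratings are the most consistent and Action ratings the most variable.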


4 Level 4: Visualization

4.0.1 Q13: Budget vs Revenue

# Scatter plot of budget against revenue with a fitted linear trend line.
ggplot(data, aes(budget_million, revenue_million)) +
  geom_point(colour = "darkgreen", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  labs(x = "Budget", y = "Revenue", title = "Budget vs Revenue Relationship")
## `geom_smooth()` using formula = 'y ~ x'

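Answer: The fitted line is essentially flat. Budget and revenue are almost uncorrelated (r ≈ 0.015, p ≈ 0.49 in the correlation results later in this report), so the data show no meaningful positive relationship.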


4.0.2 Q14: Genre revenue

# Sum revenue within each genre.
genre_revenue <- summarise(
  group_by(data, genre),
  total_revenue = sum(revenue_million),
  .groups = "drop"
)

# Bar chart of the totals, with genres ordered from largest to smallest.
ggplot(
  genre_revenue,
  aes(x = reorder(genre, -total_revenue), y = total_revenue, fill = genre)
) +
  geom_col() +
  labs(title = "Total Revenue by Genre", x = "Genre", y = "Total Revenue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Answer: Genre comparison.


4.0.3 Q15: Revenue trend

# Line-and-point chart of total revenue per year.
ggplot(yearly_data, aes(year, total_revenue)) +
  geom_line(colour = "blue") +
  geom_point(colour = "red") +
  labs(x = "Year", y = "Revenue", title = "Yearly Revenue Trend")

Answer: Trend analysis.


5 Unit IV: EDA

5.0.1 Q16: Descriptive stats

# Central tendency and spread of the rating column.
mean(data[["rating"]])
## [1] 6.665745
median(data[["rating"]])
## [1] 6.6
sd(data[["rating"]])
## [1] 1.568482
quantile(data[["rating"]])
##    0%   25%   50%   75%  100% 
## 4.000 5.315 6.600 7.960 9.500

Answer: Central tendency + spread.


5.0.2 Q17: IQR Outliers

# Quartiles, IQR and the 1.5 * IQR fences for revenue.
Q1 <- quantile(data[["revenue_million"]], probs = 0.25)
Q3 <- quantile(data[["revenue_million"]], probs = 0.75)
IQR_val <- Q3 - Q1
lower <- Q1 - IQR_val * 1.5
upper <- Q3 + IQR_val * 1.5

# Count the revenues lying outside the fences.
with(data, sum(revenue_million < lower | revenue_million > upper))
## [1] 0

Answer: No revenue value falls outside the 1.5 × IQR fences (count = 0), so this rule finds no revenue outliers.


5.0.3 Q18: Z-score Outliers

# Standardise ratings, then count those more than 3 standard deviations
# from the mean.
z <- scale(data[["rating"]])
length(which(abs(z) > 3))
## [1] 0

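Answer: No rating lies more than 3 standard deviations from the mean (count = 0), so there are no extreme ratings by the z-score rule.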


5.0.4 Q19: Density plot

# Density curve of the rating distribution.
ggplot(data, aes(rating)) +
  geom_density(alpha = 0.5, fill = "skyblue") +
  labs(x = "Rating", y = "Density", title = "Density Plot of Ratings")

Answer: Distribution shape.


6 Unit V: Correlation

6.0.1 Q20: Correlation matrix

# Pearson correlations between all numeric columns, using complete rows only.
numeric_data <- select(data, where(is.numeric))
cor_matrix <- cor(numeric_data, use = "complete.obs", method = "pearson")
cor_matrix
##                                     year       rating        votes
## year                        1.0000000000  0.008922438  0.003213675
## rating                      0.0089224379  1.000000000 -0.018007572
## votes                       0.0032136751 -0.018007572  1.000000000
## duration_min               -0.0428898086 -0.012326106  0.003293255
## budget_million              0.0041827498 -0.021969857  0.013043434
## revenue_million             0.0016225631 -0.013533906  0.014207714
## popularity_score           -0.0129252097 -0.018545661  0.022628176
## awards_won                  0.0208571400 -0.047914174 -0.012863168
## critic_score               -0.0054956357  0.007692713 -0.021702091
## user_score                  0.0060326496  0.033200573 -0.024237762
## release_month               0.0297635860 -0.007988467  0.009563739
## weekend_collection_million -0.0270523152 -0.023227893 -0.004183373
## profit_million              0.0003859642 -0.006857652  0.010021066
## roi                         0.0087078458  0.004399491 -0.023535097
##                             duration_min budget_million revenue_million
## year                       -0.0428898086     0.00418275    0.0016225631
## rating                     -0.0123261058    -0.02196986   -0.0135339061
## votes                       0.0032932554     0.01304343    0.0142077141
## duration_min                1.0000000000     0.06067780    0.0342986944
## budget_million              0.0606777952     1.00000000    0.0154235583
## revenue_million             0.0342986944     0.01542356    1.0000000000
## popularity_score           -0.0029036704    -0.03336210   -0.0560458608
## awards_won                 -0.0003804844    -0.01780206    0.0024063133
## critic_score                0.0049251697    -0.02567229    0.0253768063
## user_score                  0.0116550324    -0.01955360    0.0205360943
## release_month              -0.0139357431    -0.05142280   -0.0229659124
## weekend_collection_million  0.0265800853    -0.01057077   -0.0002487916
## profit_million              0.0159710427    -0.26676331    0.9595329946
## roi                        -0.0035750553    -0.31366016    0.1861823632
##                            popularity_score    awards_won  critic_score
## year                            -0.01292521  0.0208571400 -0.0054956357
## rating                          -0.01854566 -0.0479141739  0.0076927131
## votes                            0.02262818 -0.0128631678 -0.0217020910
## duration_min                    -0.00290367 -0.0003804844  0.0049251697
## budget_million                  -0.03336210 -0.0178020600 -0.0256722940
## revenue_million                 -0.05604586  0.0024063133  0.0253768063
## popularity_score                 1.00000000  0.0203571603 -0.0497163761
## awards_won                       0.02035716  1.0000000000  0.0356473586
## critic_score                    -0.04971638  0.0356473586  1.0000000000
## user_score                      -0.01079356  0.0142911594  0.0007226204
## release_month                   -0.02471972 -0.0170061241  0.0703392690
## weekend_collection_million       0.01510772  0.0116372824  0.0147078267
## profit_million                  -0.04462554  0.0073329786  0.0316901940
## roi                             -0.01852617  0.0152531612 -0.0103211339
##                               user_score release_month
## year                        0.0060326496   0.029763586
## rating                      0.0332005730  -0.007988467
## votes                      -0.0242377618   0.009563739
## duration_min                0.0116550324  -0.013935743
## budget_million             -0.0195535991  -0.051422798
## revenue_million             0.0205360943  -0.022965912
## popularity_score           -0.0107935637  -0.024719719
## awards_won                  0.0142911594  -0.017006124
## critic_score                0.0007226204   0.070339269
## user_score                  1.0000000000  -0.001271712
## release_month              -0.0012717116   1.000000000
## weekend_collection_million  0.0020579331  -0.004359497
## profit_million              0.0253011380  -0.007654120
## roi                         0.0159507822   0.002817174
##                            weekend_collection_million profit_million
## year                                    -0.0270523152   0.0003859642
## rating                                  -0.0232278927  -0.0068576524
## votes                                   -0.0041833728   0.0100210664
## duration_min                             0.0265800853   0.0159710427
## budget_million                          -0.0105707702  -0.2667633129
## revenue_million                         -0.0002487916   0.9595329946
## popularity_score                         0.0151077187  -0.0446255407
## awards_won                               0.0116372824   0.0073329786
## critic_score                             0.0147078267   0.0316901940
## user_score                               0.0020579331   0.0253011380
## release_month                           -0.0043594971  -0.0076541201
## weekend_collection_million               1.0000000000   0.0027372387
## profit_million                           0.0027372387   1.0000000000
## roi                                      0.0096396280   0.2677928694
##                                     roi
## year                        0.008707846
## rating                      0.004399491
## votes                      -0.023535097
## duration_min               -0.003575055
## budget_million             -0.313660156
## revenue_million             0.186182363
## popularity_score           -0.018526166
## awards_won                  0.015253161
## critic_score               -0.010321134
## user_score                  0.015950782
## release_month               0.002817174
## weekend_collection_million  0.009639628
## profit_million              0.267792869
## roi                         1.000000000

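Answer: Almost all pairwise correlations are close to zero. The only strong one is revenue vs profit (0.96), which is expected because profit is computed from revenue. Budget is weakly negatively correlated with profit (-0.27) and with roi (-0.31).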


6.0.2 Q21: Platform correlation

# Budget-revenue correlation computed separately within each platform.
group_by(data, platform) %>%
  summarise(
    corr = cor(budget_million, revenue_million, use = "complete.obs")
  )
## # A tibble: 4 × 2
##   platform         corr
##   <chr>           <dbl>
## 1 Amazon Prime  0.00422
## 2 Disney+      -0.0189 
## 3 Hulu          0.0146 
## 4 Netflix       0.0646

Answer: The budget–revenue correlation is close to zero on every platform, ranging from -0.019 (Disney+) to 0.065 (Netflix).


6.0.3 Q22: Significance test

# Run Hmisc::rcorr() on the numeric columns and show the coefficient matrix.
res <- numeric_data %>%
  as.matrix() %>%
  rcorr()
res[["r"]]
##                                     year       rating        votes
## year                        1.0000000000  0.008922438  0.003213675
## rating                      0.0089224379  1.000000000 -0.018007572
## votes                       0.0032136751 -0.018007572  1.000000000
## duration_min               -0.0428898086 -0.012326106  0.003293255
## budget_million              0.0041827498 -0.021969857  0.013043434
## revenue_million             0.0016225631 -0.013533906  0.014207714
## popularity_score           -0.0129252097 -0.018545661  0.022628176
## awards_won                  0.0208571400 -0.047914174 -0.012863168
## critic_score               -0.0054956357  0.007692713 -0.021702091
## user_score                  0.0060326496  0.033200573 -0.024237762
## release_month               0.0297635860 -0.007988467  0.009563739
## weekend_collection_million -0.0270523152 -0.023227893 -0.004183373
## profit_million              0.0003859642 -0.006857652  0.010021066
## roi                         0.0087078458  0.004399491 -0.023535097
##                             duration_min budget_million revenue_million
## year                       -0.0428898086     0.00418275    0.0016225631
## rating                     -0.0123261058    -0.02196986   -0.0135339061
## votes                       0.0032932554     0.01304343    0.0142077141
## duration_min                1.0000000000     0.06067780    0.0342986944
## budget_million              0.0606777952     1.00000000    0.0154235583
## revenue_million             0.0342986944     0.01542356    1.0000000000
## popularity_score           -0.0029036704    -0.03336210   -0.0560458608
## awards_won                 -0.0003804844    -0.01780206    0.0024063133
## critic_score                0.0049251697    -0.02567229    0.0253768063
## user_score                  0.0116550324    -0.01955360    0.0205360943
## release_month              -0.0139357431    -0.05142280   -0.0229659124
## weekend_collection_million  0.0265800853    -0.01057077   -0.0002487916
## profit_million              0.0159710427    -0.26676331    0.9595329946
## roi                        -0.0035750553    -0.31366016    0.1861823632
##                            popularity_score    awards_won  critic_score
## year                            -0.01292521  0.0208571400 -0.0054956357
## rating                          -0.01854566 -0.0479141739  0.0076927131
## votes                            0.02262818 -0.0128631678 -0.0217020910
## duration_min                    -0.00290367 -0.0003804844  0.0049251697
## budget_million                  -0.03336210 -0.0178020600 -0.0256722940
## revenue_million                 -0.05604586  0.0024063133  0.0253768063
## popularity_score                 1.00000000  0.0203571603 -0.0497163761
## awards_won                       0.02035716  1.0000000000  0.0356473586
## critic_score                    -0.04971638  0.0356473586  1.0000000000
## user_score                      -0.01079356  0.0142911594  0.0007226204
## release_month                   -0.02471972 -0.0170061241  0.0703392690
## weekend_collection_million       0.01510772  0.0116372824  0.0147078267
## profit_million                  -0.04462554  0.0073329786  0.0316901940
## roi                             -0.01852617  0.0152531612 -0.0103211339
##                               user_score release_month
## year                        0.0060326496   0.029763586
## rating                      0.0332005730  -0.007988467
## votes                      -0.0242377618   0.009563739
## duration_min                0.0116550324  -0.013935743
## budget_million             -0.0195535991  -0.051422798
## revenue_million             0.0205360943  -0.022965912
## popularity_score           -0.0107935637  -0.024719719
## awards_won                  0.0142911594  -0.017006124
## critic_score                0.0007226204   0.070339269
## user_score                  1.0000000000  -0.001271712
## release_month              -0.0012717116   1.000000000
## weekend_collection_million  0.0020579331  -0.004359497
## profit_million              0.0253011380  -0.007654120
## roi                         0.0159507822   0.002817174
##                            weekend_collection_million profit_million
## year                                    -0.0270523152   0.0003859642
## rating                                  -0.0232278927  -0.0068576524
## votes                                   -0.0041833728   0.0100210664
## duration_min                             0.0265800853   0.0159710427
## budget_million                          -0.0105707702  -0.2667633129
## revenue_million                         -0.0002487916   0.9595329946
## popularity_score                         0.0151077187  -0.0446255407
## awards_won                               0.0116372824   0.0073329786
## critic_score                             0.0147078267   0.0316901940
## user_score                               0.0020579331   0.0253011380
## release_month                           -0.0043594971  -0.0076541201
## weekend_collection_million               1.0000000000   0.0027372387
## profit_million                           0.0027372387   1.0000000000
## roi                                      0.0096396280   0.2677928694
##                                     roi
## year                        0.008707846
## rating                      0.004399491
## votes                      -0.023535097
## duration_min               -0.003575055
## budget_million             -0.313660156
## revenue_million             0.186182363
## popularity_score           -0.018526166
## awards_won                  0.015253161
## critic_score               -0.010321134
## user_score                  0.015950782
## release_month               0.002817174
## weekend_collection_million  0.009639628
## profit_million              0.267792869
## roi                         1.000000000
# p-value matrix from the rcorr() result.
res[["P"]]
##                                  year     rating     votes duration_min
## year                               NA 0.69005382 0.8857923  0.055139691
## rating                     0.69005382         NA 0.4208861  0.581690909
## votes                      0.88579230 0.42088612        NA  0.882984277
## duration_min               0.05513969 0.58169091 0.8829843           NA
## budget_million             0.85170601 0.32608697 0.5599066  0.006639629
## revenue_million            0.94218975 0.54524279 0.5254133  0.125183132
## popularity_score           0.56346958 0.40713839 0.3117965  0.896744303
## awards_won                 0.35119292 0.03213896 0.5653437  0.986432514
## critic_score               0.80597602 0.73098271 0.3320193  0.825774647
## user_score                 0.78745101 0.13774110 0.2786189  0.602421997
## release_month              0.18334350 0.72106378 0.6690547  0.533372585
## weekend_collection_million 0.22655481 0.29914270 0.8516842  0.234768681
## profit_million             0.98623713 0.75922747 0.6542349  0.475322574
## roi                        0.69713514 0.84411776 0.2927953  0.873052890
##                            budget_million revenue_million popularity_score
## year                          0.851706007      0.94218975       0.56346958
## rating                        0.326086967      0.54524279       0.40713839
## votes                         0.559906630      0.52541334       0.31179649
## duration_min                  0.006639629      0.12518313       0.89674430
## budget_million                         NA      0.49058903       0.13583524
## revenue_million               0.490589026              NA       0.01218122
## popularity_score              0.135835240      0.01218122               NA
## awards_won                    0.426207792      0.91435545       0.36286206
## critic_score                  0.251145605      0.25664422       0.02619094
## user_score                    0.382118040      0.35865830       0.62951108
## release_month                 0.021460614      0.30462764       0.26916950
## weekend_collection_million    0.636600721      0.99112823       0.49951449
## profit_million                0.000000000      0.00000000       0.04599260
## roi                           0.000000000      0.00000000       0.40763175
##                            awards_won critic_score user_score release_month
## year                       0.35119292  0.805976015  0.7874510   0.183343501
## rating                     0.03213896  0.730982715  0.1377411   0.721063775
## votes                      0.56534371  0.332019333  0.2786189   0.669054672
## duration_min               0.98643251  0.825774647  0.6024220   0.533372585
## budget_million             0.42620779  0.251145605  0.3821180   0.021460614
## revenue_million            0.91435545  0.256644225  0.3586583   0.304627644
## popularity_score           0.36286206  0.026190938  0.6295111   0.269169501
## awards_won                         NA  0.111001410  0.5229835   0.447184961
## critic_score               0.11100141           NA  0.9742357   0.001646167
## user_score                 0.52298354  0.974235706         NA   0.954674930
## release_month              0.44718496  0.001646167  0.9546749            NA
## weekend_collection_million 0.60297483  0.510937670  0.9267170   0.845516893
## profit_million             0.74310814  0.156571141  0.2580656   0.732280402
## roi                        0.49539369  0.644584203  0.4758829   0.899803700
##                            weekend_collection_million profit_million       roi
## year                                        0.2265548      0.9862371 0.6971351
## rating                                      0.2991427      0.7592275 0.8441178
## votes                                       0.8516842      0.6542349 0.2927953
## duration_min                                0.2347687      0.4753226 0.8730529
## budget_million                              0.6366007      0.0000000 0.0000000
## revenue_million                             0.9911282      0.0000000 0.0000000
## popularity_score                            0.4995145      0.0459926 0.4076318
## awards_won                                  0.6029748      0.7431081 0.4953937
## critic_score                                0.5109377      0.1565711 0.6445842
## user_score                                  0.9267170      0.2580656 0.4758829
## release_month                               0.8455169      0.7322804 0.8998037
## weekend_collection_million                         NA      0.9026323 0.6665863
## profit_million                              0.9026323             NA 0.0000000
## roi                                         0.6665863      0.0000000        NA

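Answer: The matrix above gives the p-value for each pairwise correlation (NA on the diagonal); values below 0.05 indicate a statistically significant correlation between the two variables.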


6.0.4 Q23: Heatmap

# Correlation heatmap for the key numeric variables.

# Select the numeric columns to compare
num <- data[, c("rating", "votes", "budget_million",
                "revenue_million", "profit_million", "popularity_score")]

# Pairwise correlations, using only rows with no missing values
corr <- cor(num, use = "complete.obs")

# Reshape the matrix to long form: Var1, Var2 and Freq (the correlation)
corr_df <- as.data.frame(as.table(corr))

ggplot(corr_df, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  # Pin the colour scale to the full correlation range so that white always
  # means r = 0 and colours are comparable across plots, instead of letting
  # the range depend on the observed values.
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1), name = "Correlation"
  ) +
  labs(title = "Correlation Heatmap", x = "", y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

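Answer: The heatmap visualizes the pairwise correlations among the six selected numeric variables; red tiles indicate positive correlation and blue tiles indicate negative correlation.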


7 Unit VI: Regression

7.0.1 Q24: Perform Multiple Linear Regression and find R² & F-value

# Multiple linear regression: revenue explained by six candidate predictors
model_better <- lm(
  revenue_million ~ budget_million + rating + popularity_score +
    votes + duration_min + awards_won,
  data = data
)

# Coefficient table plus R-squared, adjusted R-squared and the F-statistic
summary(model_better)
## 
## Call:
## lm(formula = revenue_million ~ budget_million + rating + popularity_score + 
##     votes + duration_min + awards_won, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -517.69 -258.77   -4.04  250.00  544.74 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.830e+02  4.317e+01  11.187   <2e-16 ***
## budget_million    3.798e-02  7.670e-02   0.495    0.621    
## rating           -2.496e+00  4.141e+00  -0.603    0.547    
## popularity_score -5.693e-01  2.265e-01  -2.514    0.012 *  
## votes             3.021e-06  4.495e-06   0.672    0.502    
## duration_min      2.791e-01  1.879e-01   1.485    0.138    
## awards_won        1.672e-01  1.131e+00   0.148    0.883    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 289.9 on 1993 degrees of freedom
## Multiple R-squared:  0.004867,   Adjusted R-squared:  0.001872 
## F-statistic: 1.625 on 6 and 1993 DF,  p-value: 0.1363
# Extract the headline fit statistics.
# Compute the summary once and reuse it rather than rebuilding it per value.
fit_stats <- summary(model_better)
r2 <- fit_stats$r.squared          # multiple R-squared
adj_r2 <- fit_stats$adj.r.squared  # adjusted R-squared
f_val <- fit_stats$fstatistic      # F value with numerator/denominator df

# Display the multiple R-squared extracted from model_better
r2
## [1] 0.004867397
# Display the adjusted R-squared extracted from model_better
adj_r2
## [1] 0.001871514
# Display the F-statistic (value, numerator df, denominator df)
f_val
##       value       numdf       dendf 
##    1.624695    6.000000 1993.000000

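Answer: R² shows how much of the variance in revenue is explained by the predictors. Adjusted R² corrects R² by penalizing extra predictors. The F-value tests the overall significance of the model. Here R² ≈ 0.005 and the F-test is not significant (F = 1.625, p = 0.1363), so the predictors together explain almost none of the variation in revenue.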


7.0.2 Q25: Multiple regression

# Three-predictor linear model of revenue (budget, rating, popularity)
model_multi <- lm(
  revenue_million ~ budget_million + rating + popularity_score,
  data = data
)

# Coefficients, significance tests and overall fit
summary(model_multi)
## 
## Call:
## lm(formula = revenue_million ~ budget_million + rating + popularity_score, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -507.13 -257.28   -2.14  249.36  541.29 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      525.07153   33.19178  15.819   <2e-16 ***
## budget_million     0.04533    0.07653   0.592   0.5537    
## rating            -2.64153    4.13519  -0.639   0.5230    
## popularity_score  -0.56554    0.22634  -2.499   0.0125 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 289.9 on 1996 degrees of freedom
## Multiple R-squared:  0.003529,   Adjusted R-squared:  0.002031 
## F-statistic: 2.356 on 3 and 1996 DF,  p-value: 0.07008

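Answer: The multiple regression model with three predictors explains almost none of the variance in revenue (R² ≈ 0.0035); only popularity_score is significant at the 5% level (p = 0.0125).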


7.0.3 Q26: Polynomial regression

# Add the squared budget as an explicit column for the quadratic term
data[["budget_sq"]] <- data[["budget_million"]]^2

# Second-degree polynomial regression of revenue on budget
model_poly <- lm(
  revenue_million ~ budget_million + budget_sq,
  data = data
)
summary(model_poly)
## 
## Call:
## lm(formula = revenue_million ~ budget_million + budget_sq, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -497.52 -257.83   -0.38  251.13  518.61 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.871e+02  2.048e+01  23.789   <2e-16 ***
## budget_million -1.282e-01  3.079e-01  -0.416    0.677    
## budget_sq       5.951e-04  9.805e-04   0.607    0.544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 290.2 on 1997 degrees of freedom
## Multiple R-squared:  0.0004223,  Adjusted R-squared:  -0.0005788 
## F-statistic: 0.4218 on 2 and 1997 DF,  p-value: 0.6559

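Answer: The polynomial model tests for a non-linear (quadratic) relationship between budget and revenue. Neither the linear nor the squared term is significant (p = 0.677 and p = 0.544; R² ≈ 0.0004), so there is no evidence of a non-linear relationship.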


7.0.4 Q27: Diagnostics

# Draw the diagnostic plots for model_multi on a 2 x 2 grid.
# par() returns the previous settings; keep them so the layout can be
# restored and later plots are not forced into the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(model_multi)
par(old_par)

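Answer: The four diagnostic plots (residuals vs fitted, normal Q-Q, scale-location, and residuals vs leverage) are used to validate the assumptions of the linear model.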


7.0.5 Q28: Predict Revenue using Model

# Two hypothetical movies, one row each, with the predictors model_multi uses
new_data <- rbind(
  data.frame(budget_million = 50, rating = 7, popularity_score = 60),
  data.frame(budget_million = 200, rating = 8, popularity_score = 90)
)

# Predicted revenue (in millions) for each hypothetical movie
predict(model_multi, newdata = new_data)
##        1        2 
## 474.9154 462.1080

Answer:

The model predicts revenue for new movies based on the input values.


7.0.6 Q29: Train-test split

# 70/30 train-test split with a fixed seed for reproducibility
set.seed(123)

n_rows <- nrow(data)
# seq_len() is safe for zero rows (1:0 would yield c(1, 0)); floor() makes
# the truncation of a fractional sample size explicit.
idx <- sample(seq_len(n_rows), size = floor(0.7 * n_rows))

train <- data[idx, ]   # sampled rows
test <- data[-idx, ]   # remaining rows

Answer: The data is split at random into a 70% training set and a 30% test set, with a fixed seed for reproducibility.


7.0.7 Q30: Cross validation

library(caret)

# Resampling scheme: 5-fold cross-validation
train_control <- trainControl(
  method = "cv",
  number = 5
)

# Fit the three-predictor linear model under that resampling scheme
caret_model <- train(
  revenue_million ~ budget_million + rating + popularity_score,
  data      = data,
  method    = "lm",
  trControl = train_control
)

# Print the cross-validated RMSE, R-squared and MAE
caret_model
## Linear Regression 
## 
## 2000 samples
##    3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1600, 1600, 1600, 1600, 1600 
## Resampling results:
## 
##   RMSE      Rsquared     MAE     
##   289.9708  0.001997352  251.5352
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Answer: Cross-validation improves the reliability of model evaluation by testing on multiple data splits.