library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
# Load the raw inputs: movie metadata through base R's CSV reader and the
# cast/crew table through readr.
movies <- read.csv(file = "movies_metadata.csv")
credits <- read_csv(file = "credits.csv")
## Rows: 45476 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cast, crew
## dbl (1): id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the ratings table with readr.
ratings <- read_csv(file = "ratings_small.csv")
## Rows: 100004 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): userId, movieId, rating, timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column names and types of the raw metadata before cleaning.
movies %>% glimpse()
## Rows: 45,466
## Columns: 24
## $ adult                 <chr> "False", "False", "False", "False", "False", "Fa…
## $ belongs_to_collection <chr> "{'id': 10194, 'name': 'Toy Story Collection', '…
## $ budget                <chr> "30000000", "65000000", "0", "16000000", "0", "6…
## $ genres                <chr> "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'n…
## $ homepage              <chr> "http://toystory.disney.com/toy-story", "", "", …
## $ id                    <chr> "862", "8844", "15602", "31357", "11862", "949",…
## $ imdb_id               <chr> "tt0114709", "tt0113497", "tt0113228", "tt011488…
## $ original_language     <chr> "en", "en", "en", "en", "en", "en", "en", "en", …
## $ original_title        <chr> "Toy Story", "Jumanji", "Grumpier Old Men", "Wai…
## $ overview              <chr> "Led by Woody, Andy's toys live happily in his r…
## $ popularity            <chr> "21.946943", "17.015539", "11.7129", "3.859495",…
## $ poster_path           <chr> "/rhIRbceoE9lR4veEXuwCC2wARtG.jpg", "/vzmL6fP7aP…
## $ production_companies  <chr> "[{'name': 'Pixar Animation Studios', 'id': 3}]"…
## $ production_countries  <chr> "[{'iso_3166_1': 'US', 'name': 'United States of…
## $ release_date          <chr> "1995-10-30", "1995-12-15", "1995-12-22", "1995-…
## $ revenue               <dbl> 373554033, 262797249, 0, 81452156, 76578911, 187…
## $ runtime               <dbl> 81, 104, 101, 127, 106, 170, 127, 97, 106, 130, …
## $ spoken_languages      <chr> "[{'iso_639_1': 'en', 'name': 'English'}]", "[{'…
## $ status                <chr> "Released", "Released", "Released", "Released", …
## $ tagline               <chr> "", "Roll the dice and unleash the excitement!",…
## $ title                 <chr> "Toy Story", "Jumanji", "Grumpier Old Men", "Wai…
## $ video                 <chr> "False", "False", "False", "False", "False", "Fa…
## $ vote_average          <dbl> 7.7, 6.9, 6.5, 6.1, 5.7, 7.7, 6.2, 5.4, 5.5, 6.6…
## $ vote_count            <int> 5415, 2413, 92, 34, 173, 1886, 141, 45, 174, 119…
# Clean and type the movie metadata ----

# Budget arrives as text; entries that are not numbers become NA, which is
# what raises the coercion warning recorded below.
movies$budget <- as.numeric(movies$budget)
## Warning: NAs introduced by coercion
movies$revenue <- as.numeric(movies$revenue)

# A value of 0 is treated as "not reported" rather than as a real amount.
movies$budget[movies$budget == 0] <- NA
movies$revenue[movies$revenue == 0] <- NA

# Keep only rows whose release date parses as YYYY-MM-DD.
movies$release_date <- as.Date(movies$release_date, format = "%Y-%m-%d")
movies <- movies[!is.na(movies$release_date), ]

# Runtimes at or below 30, or above 400, are treated as missing.
movies$runtime <- as.numeric(movies$runtime)
movies$runtime[movies$runtime <= 30 | movies$runtime > 400] <- NA

movies$vote_average <- as.numeric(movies$vote_average)
movies$vote_count <- as.numeric(movies$vote_count)

# Require at least 10 votes. The explicit is.na() guard matters: a bare
# `vote_count >= 10` mask is NA for missing counts, and an NA in a row index
# inserts an all-NA row instead of dropping the record.
movies <- movies[!is.na(movies$vote_count) & movies$vote_count >= 10, ]

# The raw flags are capitalised text ("False" in the data preview), so a
# comparison against the literal "TRUE" can never match. Normalise case and
# whitespace before comparing.
movies$adult <- toupper(trimws(movies$adult)) == "TRUE"
movies$video <- toupper(trimws(movies$video)) == "TRUE"

movies$id <- as.numeric(movies$id)
# Re-inspect the metadata after cleaning and type conversion.
movies %>% glimpse()
## Rows: 22,928
## Columns: 24
## $ adult                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ belongs_to_collection <chr> "{'id': 10194, 'name': 'Toy Story Collection', '…
## $ budget                <dbl> 30000000, 65000000, NA, 16000000, NA, 60000000, …
## $ genres                <chr> "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'n…
## $ homepage              <chr> "http://toystory.disney.com/toy-story", "", "", …
## $ id                    <dbl> 862, 8844, 15602, 31357, 11862, 949, 11860, 4532…
## $ imdb_id               <chr> "tt0114709", "tt0113497", "tt0113228", "tt011488…
## $ original_language     <chr> "en", "en", "en", "en", "en", "en", "en", "en", …
## $ original_title        <chr> "Toy Story", "Jumanji", "Grumpier Old Men", "Wai…
## $ overview              <chr> "Led by Woody, Andy's toys live happily in his r…
## $ popularity            <chr> "21.946943", "17.015539", "11.7129", "3.859495",…
## $ poster_path           <chr> "/rhIRbceoE9lR4veEXuwCC2wARtG.jpg", "/vzmL6fP7aP…
## $ production_companies  <chr> "[{'name': 'Pixar Animation Studios', 'id': 3}]"…
## $ production_countries  <chr> "[{'iso_3166_1': 'US', 'name': 'United States of…
## $ release_date          <date> 1995-10-30, 1995-12-15, 1995-12-22, 1995-12-22,…
## $ revenue               <dbl> 373554033, 262797249, NA, 81452156, 76578911, 18…
## $ runtime               <dbl> 81, 104, 101, 127, 106, 170, 127, 97, 106, 130, …
## $ spoken_languages      <chr> "[{'iso_639_1': 'en', 'name': 'English'}]", "[{'…
## $ status                <chr> "Released", "Released", "Released", "Released", …
## $ tagline               <chr> "", "Roll the dice and unleash the excitement!",…
## $ title                 <chr> "Toy Story", "Jumanji", "Grumpier Old Men", "Wai…
## $ video                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ vote_average          <dbl> 7.7, 6.9, 6.5, 6.1, 5.7, 7.7, 6.2, 5.4, 5.5, 6.6…
## $ vote_count            <dbl> 5415, 2413, 92, 34, 173, 1886, 141, 45, 174, 119…
# Collapse the ratings to one row per movieId: mean rating and number of votes.
ratings_summary <- ratings %>%
  group_by(movieId) %>%
  summarise(avg_rating = mean(rating), n_votes = n())

# Keep only movies that have a matching ratings row.
# NOTE(review): this matches movies$id against ratings$movieId directly;
# confirm that both columns use the same identifier scheme.
movies_ratings <- inner_join(movies, ratings_summary, by = c("id" = "movieId"))

# Rows usable for the revenue-vs-rating plot: both values present.
movies_ratings_clean <- movies_ratings %>%
  filter(!(is.na(avg_rating) | is.na(revenue)))

# Scatter plot with a fitted linear trend line.
ggplot(movies_ratings_clean, aes(x = avg_rating, y = revenue)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", col = "purple") +
  ggtitle("Revenue vs Average Rating") +
  theme_light()
## `geom_smooth()` using formula = 'y ~ x'

# Average rating per genre.
# `genres` holds a stringified list such as
# "[{'id': 16, 'name': 'Animation'}, ...]". Splitting it on "," leaves
# fragments that still carry brackets (" 'name': 'Western'}" versus
# " 'name': 'Animation'}]"), so the group key depends on where the genre sat
# in the list. Extract just the quoted names instead, giving one row per
# (movie, genre); movies with no genre names drop out in unnest().
# The output recorded below predates this fix; re-knit to refresh it.
genres_rating <- movies_ratings %>%
  filter(!is.na(genres), !is.na(avg_rating)) %>%
  mutate(genres = str_extract_all(genres, "(?<='name': ')[^']+")) %>%
  unnest(genres) %>%
  group_by(genres) %>%
  summarise(avg_rating = mean(avg_rating, na.rm = TRUE)) %>%
  arrange(desc(avg_rating))
head(genres_rating, 5)
## # A tibble: 5 × 2
##   genres                   avg_rating
##   <chr>                         <dbl>
## 1 " 'name': 'Western'}"          3.63
## 2 " 'name': 'TV Movie'}]"        3.60
## 3 " 'name': 'Animation'}]"       3.50
## 4 " 'name': 'Mystery'}]"         3.49
## 5 " 'name': 'Music'}"            3.49
# Label each movie: "Successful" means revenue of at least 2.5x budget AND an
# average rating of 3.5 or more.
# With a plain ifelse(), `NA & FALSE` evaluates to FALSE, so a movie with
# unknown revenue or budget and a rating under 3.5 was labelled
# "Unsuccessful", while one with a higher rating got NA. Unknown inputs now
# always give NA, so the label never rests on missing financials.
# The counts recorded below predate this fix; re-knit to refresh them.
movies_ratings <- movies_ratings %>%
  mutate(
    success = case_when(
      is.na(revenue) | is.na(budget) | is.na(avg_rating) ~ NA_character_,
      revenue >= 2.5 * budget & avg_rating >= 3.5 ~ "Successful",
      TRUE ~ "Unsuccessful"
    )
  )
table(movies_ratings$success)
## 
##   Successful Unsuccessful 
##          282         1310
# Distribution summary of revenue, printed without scientific notation.
movies_ratings$revenue %>%
  summary() %>%
  format(scientific = FALSE)
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "         5" "  12136938" "  50000000" " 114545574" " 141774679" "1845034188" 
##         NA's 
##       "1009"
# Distribution summary of budget, printed without scientific notation.
movies_ratings$budget %>%
  summary() %>%
  format(scientific = FALSE)
##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
## "        1" "  4000000" " 18000000" " 32169782" " 44000000" "380000000" 
##        NA's 
##       "969"
# Correlation between budget and revenue over rows where both are present.
with(movies_ratings, cor(budget, revenue, use = "complete.obs"))
## [1] 0.6921359
# Correlation between average rating and revenue over rows where both are
# present.
with(movies_ratings, cor(avg_rating, revenue, use = "complete.obs"))
## [1] -0.01698542
# Mean revenue per release year, over movies with a known revenue and date.
# The year is taken from the first four characters of the date's text form.
movies_by_year <- movies %>%
  filter(!(is.na(revenue) | is.na(release_date))) %>%
  group_by(year = as.numeric(substr(release_date, 1, 4))) %>%
  summarise(avg_revenue = mean(revenue, na.rm = TRUE))

# Line chart of the yearly averages.
ggplot(movies_by_year, aes(x = year, y = avg_revenue)) +
  geom_line(color = "pink") +
  ggtitle("Average Movie Revenue by Year") +
  theme_light()

# Box plot of revenue for each value of the success label.
ggplot(movies_ratings, aes(x = success, y = revenue, fill = success)) +
  geom_boxplot() +
  ggtitle("Revenue Distribution: Successful vs Unsuccessful Movies") +
  theme_light()
## Warning: Removed 1009 rows containing non-finite outside the scale range
## (`stat_boxplot()`).