library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
setwd("~/Desktop/Pankti _ Data Science")
movies_metadata <-read_csv("~/Desktop/Pankti _ Data Science/movies_metadata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## adult = col_logical(),
## budget = col_double(),
## id = col_double(),
## popularity = col_double(),
## release_date = col_date(format = ""),
## revenue = col_double(),
## runtime = col_double(),
## video = col_logical(),
## vote_average = col_double(),
## vote_count = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 19 parsing failures.
## row col expected actual file
## 19730 NA 24 columns 10 columns '~/Desktop/Pankti _ Data Science/movies_metadata.csv'
## 19731 adult 1/0/T/F/TRUE/FALSE - Written by Ørnås '~/Desktop/Pankti _ Data Science/movies_metadata.csv'
## 19731 budget a double /ff9qCepilowshEtG2GYWwzt2bs4.jpg '~/Desktop/Pankti _ Data Science/movies_metadata.csv'
## 19731 id no trailing characters 1997-08-20 '~/Desktop/Pankti _ Data Science/movies_metadata.csv'
## 19731 release_date date like 1 '~/Desktop/Pankti _ Data Science/movies_metadata.csv'
## ..... ............ ...................... ................................ .....................................................
## See problems(...) for more details.
head(movies_metadata)
## # A tibble: 6 x 24
## adult belongs_to_coll… budget genres homepage id imdb_id original_langua…
## <lgl> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 FALSE {'id': 10194, '… 3.00e7 [{'id… http://… 862 tt0114… en
## 2 FALSE <NA> 6.50e7 [{'id… <NA> 8844 tt0113… en
## 3 FALSE {'id': 119050, … 0. [{'id… <NA> 15602 tt0113… en
## 4 FALSE <NA> 1.60e7 [{'id… <NA> 31357 tt0114… en
## 5 FALSE {'id': 96871, '… 0. [{'id… <NA> 11862 tt0113… en
## 6 FALSE <NA> 6.00e7 [{'id… <NA> 949 tt0113… en
## # … with 16 more variables: original_title <chr>, overview <chr>,
## # popularity <dbl>, poster_path <chr>, production_companies <chr>,
## # production_countries <chr>, release_date <date>, revenue <dbl>,
## # runtime <dbl>, spoken_languages <chr>, status <chr>, tagline <chr>,
## # title <chr>, video <lgl>, vote_average <dbl>, vote_count <dbl>
movies_metadata$Year <- format(movies_metadata$release_date, format="%Y")
head(movies_metadata)
## # A tibble: 6 x 25
## adult belongs_to_coll… budget genres homepage id imdb_id original_langua…
## <lgl> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 FALSE {'id': 10194, '… 3.00e7 [{'id… http://… 862 tt0114… en
## 2 FALSE <NA> 6.50e7 [{'id… <NA> 8844 tt0113… en
## 3 FALSE {'id': 119050, … 0. [{'id… <NA> 15602 tt0113… en
## 4 FALSE <NA> 1.60e7 [{'id… <NA> 31357 tt0114… en
## 5 FALSE {'id': 96871, '… 0. [{'id… <NA> 11862 tt0113… en
## 6 FALSE <NA> 6.00e7 [{'id… <NA> 949 tt0113… en
## # … with 17 more variables: original_title <chr>, overview <chr>,
## # popularity <dbl>, poster_path <chr>, production_companies <chr>,
## # production_countries <chr>, release_date <date>, revenue <dbl>,
## # runtime <dbl>, spoken_languages <chr>, status <chr>, tagline <chr>,
## # title <chr>, video <lgl>, vote_average <dbl>, vote_count <dbl>, Year <chr>
movies_metadata[!duplicated(movies_metadata), ]
## # A tibble: 45,449 x 25
## adult belongs_to_coll… budget genres homepage id imdb_id original_langua…
## <lgl> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 FALSE {'id': 10194, '… 3.00e7 [{'id… http://… 862 tt0114… en
## 2 FALSE <NA> 6.50e7 [{'id… <NA> 8844 tt0113… en
## 3 FALSE {'id': 119050, … 0. [{'id… <NA> 15602 tt0113… en
## 4 FALSE <NA> 1.60e7 [{'id… <NA> 31357 tt0114… en
## 5 FALSE {'id': 96871, '… 0. [{'id… <NA> 11862 tt0113… en
## 6 FALSE <NA> 6.00e7 [{'id… <NA> 949 tt0113… en
## 7 FALSE <NA> 5.80e7 [{'id… <NA> 11860 tt0114… en
## 8 FALSE <NA> 0. [{'id… <NA> 45325 tt0112… en
## 9 FALSE <NA> 3.50e7 [{'id… <NA> 9091 tt0114… en
## 10 FALSE {'id': 645, 'na… 5.80e7 [{'id… http://… 710 tt0113… en
## # … with 45,439 more rows, and 17 more variables: original_title <chr>,
## # overview <chr>, popularity <dbl>, poster_path <chr>,
## # production_companies <chr>, production_countries <chr>,
## # release_date <date>, revenue <dbl>, runtime <dbl>, spoken_languages <chr>,
## # status <chr>, tagline <chr>, title <chr>, video <lgl>, vote_average <dbl>,
## # vote_count <dbl>, Year <chr>
movies_metadata_new <- na.omit(movies_metadata, cols=genres, invert=FALSE)
#view(movies_metadata_new)
head(movies_metadata_new)
## # A tibble: 6 x 25
## adult belongs_to_coll… budget genres homepage id imdb_id original_langua…
## <lgl> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 FALSE {'id': 645, 'na… 5.80e7 [{'id… http://… 710 tt0113… en
## 2 FALSE {'id': 43563, '… 3.50e6 [{'id… http://… 10634 tt0113… en
## 3 FALSE {'id': 10924, '… 1.90e7 [{'id… http://… 755 tt0116… en
## 4 FALSE {'id': 439053, … 2.00e6 [{'id… http://… 5894 tt0112… en
## 5 FALSE {'id': 286162, … 1.50e7 [{'id… http://… 9070 tt0113… en
## 6 FALSE {'id': 182813, … 2.70e4 [{'id… http://… 2292 tt0109… en
## # … with 17 more variables: original_title <chr>, overview <chr>,
## # popularity <dbl>, poster_path <chr>, production_companies <chr>,
## # production_countries <chr>, release_date <date>, revenue <dbl>,
## # runtime <dbl>, spoken_languages <chr>, status <chr>, tagline <chr>,
## # title <chr>, video <lgl>, vote_average <dbl>, vote_count <dbl>, Year <chr>
dfnew<-movies_metadata_new %>%
mutate(profit = revenue - budget)%>%
filter( Year >= "2000")
options(scipen = 999)
library(viridis)
## Loading required package: viridisLite
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
ggplot(dfnew, aes(x = vote_count, y = vote_average)) +
geom_bin2d() +
scale_x_log10(labels = comma) +
scale_y_continuous(breaks = 1:10) +
scale_fill_viridis_c(labels = comma)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2 rows containing non-finite values (stat_bin2d).
dfnew %>%
group_by(Year) %>%
summarise(title = n()) %>%
ggplot(aes(x = Year, y = title)) +
geom_bar(stat = "identity", color = "Red", fill = "white")+
labs(x = "Release year of movie", y = "Total movie relessed", title = "Movies per year") +
theme_minimal()
library(dplyr)
dfnew %>%
group_by(Year) %>%
mutate(title =n(), profit= mean(profit))%>%
ggplot(aes(x = Year, y = title,color= profit)) +
geom_point()+
labs(x = "Year Movies were released", y = "Count of movies released",title = "Profit generated based on the total movies released per year")+
theme(axis.title = element_text(size = 15, color = "firebrick",
face = "italic"))
theme(legend.title = element_text(size = 15, color = "black",
face = "italic"))
## List of 1
## $ legend.title:List of 11
## ..$ family : NULL
## ..$ face : chr "italic"
## ..$ colour : chr "black"
## ..$ size : num 15
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
Movies_metadata dataset is from Kaggle.com. movies_metadata.csv: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies,genres. This dataset has categorical variable such as budget, revenue, release_date and I created profit variable to do the analysis.
Cleaning the dataset
I cleaned the dataset in the following steps:
1..Seperated year from the release_date function by using format function 2. Removed the duplicates using !duplicated function 3. Removed rows containing missing values in field genres and created new dataset movie_metadat_new using na.omit function. 4. Created new variable profit 5. Disbale the exponential data for the budget and revenue values using options(scipen = 999) this is inbuilt function which disables the exponential format. 6. Again created new dataset Y_M which I filtered on the year for the movies released >= 2000 for better analysis
Visualizations
Created 3 visualizations :
What is the average rating compared to the number of count - Looking at the graph we get the indication that the average vote is in between 5.5-7.5
How many movies are release per year? – From year 2000- 2005 there is not much bigger gap in terms of number of movies released. Sudden rise occurred from 2009-2013 where 2020 has the highest number of movies released. And then 2017 again shows downfall
Profit per year for movies released - From the graph we see that the in year 2001,2015 and 2017 profit was generated in between 30M- 40M with less numbers of movies being released. The year 2010 generated the profit of only 10M with the highest movies being released in that year
The thing which did not work for me
I wanted to do the analyses taking genres field in particular. The way genres field is given in the dataset needs cleanup. The example is as below:
genres [{‘id’: 16, ‘name’: ‘Animation’}, {‘id’: 35, ‘name’: ‘Comedy’}, {‘id’: 10751, ‘name’: ‘Family’}] [{‘id’: 12, ‘name’: ‘Adventure’}, {‘id’: 14, ‘name’: ‘Fantasy’}, {‘id’: 10751, ‘name’: ‘Family’}]
I tried to convert the data into two fields genre id and genre name having distinct genres but could not find the proper solution. If I would have been successful I could have done some analysis such as movies per genre, top genres per year and profit generated based on the genres.