Content
This dataset consists of the following files:
movies_metadata.csv: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.
links.csv: The file that contains the TMDB and IMDB IDs of all the movies featured in the Full MovieLens dataset.
ratings.csv: It has more than 100,000 ratings from more than 1000’s users.
Cleaning the data is necessary to handle unsuported CSV parameters in Postgress. I Trim the data and work on long character format to Eliminate unwanted symbols and separate needed strings.
# load library needed in R workspace
library(stringr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DBI)
library(RODBC)
library(odbc)
library(RPostgres)
data <- read.csv("movies_metadata.csv", header = T)
head(data)
## id imdb_id adult budget
## 1 862 tt0114709 FALSE 30000000
## 2 8844 tt0113497 FALSE 65000000
## 3 15602 tt0113228 FALSE 0
## 4 31357 tt0114885 FALSE 16000000
## 5 11862 tt0113041 FALSE 0
## 6 949 tt0113277 FALSE 60000000
## genres
## 1 [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
## 2 [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]
## 3 [{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]
## 4 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]
## 5 [{'id': 35, 'name': 'Comedy'}]
## 6 [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]
## homepage original_language
## 1 http://toystory.disney.com/toy-story en
## 2 en
## 3 en
## 4 en
## 5 en
## 6 en
## original_title popularity
## 1 Toy Story 21.946943
## 2 Jumanji 17.015539
## 3 Grumpier Old Men 11.7129
## 4 Waiting to Exhale 3.859495
## 5 Father of the Bride Part II 8.387519
## 6 Heat 17.924927
## production_companies
## 1 [{'name': 'Pixar Animation Studios', 'id': 3}]
## 2 [{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]
## 3 [{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', 'id': 19464}]
## 4 [{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]
## 5 [{'name': 'Sandollar Productions', 'id': 5842}, {'name': 'Touchstone Pictures', 'id': 9195}]
## 6 [{'name': 'Regency Enterprises', 'id': 508}, {'name': 'Forward Pass', 'id': 675}, {'name': 'Warner Bros.', 'id': 6194}]
## production_countries release_date
## 1 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 10/30/1995
## 2 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 12/15/1995
## 3 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 12/22/1995
## 4 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 12/22/1995
## 5 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 2/10/1995
## 6 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 12/15/1995
## revenue runtime
## 1 373554033 81
## 2 262797249 104
## 3 0 101
## 4 81452156 127
## 5 76578911 106
## 6 187436818 170
## spoken_languages
## 1 [{'iso_639_1': 'en', 'name': 'English'}]
## 2 [{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]
## 3 [{'iso_639_1': 'en', 'name': 'English'}]
## 4 [{'iso_639_1': 'en', 'name': 'English'}]
## 5 [{'iso_639_1': 'en', 'name': 'English'}]
## 6 [{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]
## status title video vote_average vote_count
## 1 Released Toy Story FALSE 7.7 5415
## 2 Released Jumanji FALSE 6.9 2413
## 3 Released Grumpier Old Men FALSE 6.5 92
## 4 Released Waiting to Exhale FALSE 6.1 34
## 5 Released Father of the Bride Part II FALSE 5.7 173
## 6 Released Heat FALSE 7.7 1886
String manipulation needed before creating the database. Take a look at the “” [{‘id’: 16, ‘name’: ‘Animation’}, {‘id’: 35, ‘name’: ‘Comedy’}, {‘id’: 10751, ‘name’: ‘Family’}] “” .
# renaming the variable and
movie_dataset <- data %>% mutate(genres= str_match(data$genres, "[A-Z][a-z]+"),
production_companies = str_match(data$production_companies, "[A-Z][a-z]+ [A-Z][a-z]++"),
production_countries = str_extract(data$production_countries, "[A-Z][a-z]+ [A-Z][a-z]++"),
spoken_languages = str_match(data$spoken_languages, "[A-Z][a-z]+")
) %>% distinct()
head(movie_dataset)
## id imdb_id adult budget genres homepage
## 1 862 tt0114709 FALSE 30000000 Animation http://toystory.disney.com/toy-story
## 2 8844 tt0113497 FALSE 65000000 Adventure
## 3 15602 tt0113228 FALSE 0 Romance
## 4 31357 tt0114885 FALSE 16000000 Comedy
## 5 11862 tt0113041 FALSE 0 Comedy
## 6 949 tt0113277 FALSE 60000000 Action
## original_language original_title popularity
## 1 en Toy Story 21.946943
## 2 en Jumanji 17.015539
## 3 en Grumpier Old Men 11.7129
## 4 en Waiting to Exhale 3.859495
## 5 en Father of the Bride Part II 8.387519
## 6 en Heat 17.924927
## production_companies production_countries release_date revenue runtime
## 1 Pixar Animation United States 10/30/1995 373554033 81
## 2 Star Pictures United States 12/15/1995 262797249 104
## 3 Warner Bros United States 12/22/1995 0 101
## 4 Twentieth Century United States 12/22/1995 81452156 127
## 5 Sandollar Productions United States 2/10/1995 76578911 106
## 6 Regency Enterprises United States 12/15/1995 187436818 170
## spoken_languages status title video vote_average
## 1 English Released Toy Story FALSE 7.7
## 2 English Released Jumanji FALSE 6.9
## 3 English Released Grumpier Old Men FALSE 6.5
## 4 English Released Waiting to Exhale FALSE 6.1
## 5 English Released Father of the Bride Part II FALSE 5.7
## 6 English Released Heat FALSE 7.7
## vote_count
## 1 5415
## 2 2413
## 3 92
## 4 34
## 5 173
## 6 1886
This code is grey-out because it will overide the file in my computer. this is the next step before moving the file to PostGres PGAdmin 4 : write.table(movie_dataset,“movies_metadata_clean.txt”, sep = “, row.names = F)
Let run the needed libraries first
# database library
library(DBI)
library(RODBC)
library(odbc)
library(RPostgres)
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gt)
Let connect the database to the PostGres Database. It is important to make sure that the required ODBC driver must be identified and installed in order to access data from any database to another platform.
Best way to add the driver type or copy this in Windows Explore : <C:32.exe>
# Create connection to db
con <- dbConnect(odbc::odbc(), "PostgreSQL30", timeout = 10)
dbListTables(con)
## [1] "film" "links" "movie_ratings" "movies"
## [5] "movies_2" "movies_final" "movies_main" "movies_metadata"
## [9] "ratings"