library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(assertr)
library(ggplot2)
library(lubridate)
##
## Adjuntando el paquete: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readxl)
# Pick the input file interactively; file.choose() opens the system file
# dialog and returns the selected path.
ruta_archivo <- file.choose()
# Read the selected file as CSV. read.csv() already returns a data.frame, so
# no further coercion is needed.
dfmovies <- read.csv(ruta_archivo)
# Preview the first rows to check that the columns were parsed as expected.
head(dfmovies)
## adult
## 1 False
## 2 False
## 3 False
## 4 False
## 5 False
## 6 False
## belongs_to_collection
## 1 {'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}
## 2
## 3 {'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}
## 4
## 5 {'id': 96871, 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg', 'backdrop_path': '/7qwE57OVZmMJChBpLEbJEmzUydk.jpg'}
## 6
## budget
## 1 30000000
## 2 65000000
## 3 0
## 4 16000000
## 5 0
## 6 60000000
## genres
## 1 [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
## 2 [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]
## 3 [{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]
## 4 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]
## 5 [{'id': 35, 'name': 'Comedy'}]
## 6 [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]
## homepage id imdb_id original_language
## 1 http://toystory.disney.com/toy-story 862 tt0114709 en
## 2 8844 tt0113497 en
## 3 15602 tt0113228 en
## 4 31357 tt0114885 en
## 5 11862 tt0113041 en
## 6 949 tt0113277 en
## original_title
## 1 Toy Story
## 2 Jumanji
## 3 Grumpier Old Men
## 4 Waiting to Exhale
## 5 Father of the Bride Part II
## 6 Heat
## overview
## 1 Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
## 2 When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.
## 3 A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
## 4 Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.
## 5 Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.
## 6 Obsessive master thief, Neil McCauley leads a top-notch crew on various insane heists throughout Los Angeles while a mentally unstable detective, Vincent Hanna pursues him without rest. Each man recognizes and respects the ability and the dedication of the other even though they are aware their cat-and-mouse game may end in violence.
## popularity poster_path
## 1 21.946943 /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
## 2 17.015539 /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
## 3 11.7129 /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg
## 4 3.859495 /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg
## 5 8.387519 /e64sOI48hQXyru7naBFyssKFxVd.jpg
## 6 17.924927 /zMyfPUelumio3tiDKPffaUpsQTD.jpg
## production_companies
## 1 [{'name': 'Pixar Animation Studios', 'id': 3}]
## 2 [{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]
## 3 [{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', 'id': 19464}]
## 4 [{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]
## 5 [{'name': 'Sandollar Productions', 'id': 5842}, {'name': 'Touchstone Pictures', 'id': 9195}]
## 6 [{'name': 'Regency Enterprises', 'id': 508}, {'name': 'Forward Pass', 'id': 675}, {'name': 'Warner Bros.', 'id': 6194}]
## production_countries release_date
## 1 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 1995-10-30
## 2 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 1995-12-15
## 3 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 1995-12-22
## 4 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 1995-12-22
## 5 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 1995-02-10
## 6 [{'iso_3166_1': 'US', 'name': 'United States of America'}] 1995-12-15
## revenue runtime
## 1 373554033 81
## 2 262797249 104
## 3 0 101
## 4 81452156 127
## 5 76578911 106
## 6 187436818 170
## spoken_languages
## 1 [{'iso_639_1': 'en', 'name': 'English'}]
## 2 [{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]
## 3 [{'iso_639_1': 'en', 'name': 'English'}]
## 4 [{'iso_639_1': 'en', 'name': 'English'}]
## 5 [{'iso_639_1': 'en', 'name': 'English'}]
## 6 [{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]
## status
## 1 Released
## 2 Released
## 3 Released
## 4 Released
## 5 Released
## 6 Released
## tagline
## 1
## 2 Roll the dice and unleash the excitement!
## 3 Still Yelling. Still Fighting. Still Ready for Love.
## 4 Friends are the people who let you be yourself... and never let you forget it.
## 5 Just When His World Is Back To Normal... He's In For The Surprise Of His Life!
## 6 A Los Angeles Crime Saga
## title video vote_average vote_count
## 1 Toy Story False 7.7 5415
## 2 Jumanji False 6.9 2413
## 3 Grumpier Old Men False 6.5 92
## 4 Waiting to Exhale False 6.1 34
## 5 Father of the Bride Part II False 5.7 173
## 6 Heat False 7.7 1886
# Compact overview of every column: name, type and first values.
dfmovies %>%
  glimpse()
## Rows: 45,466
## Columns: 24
## $ adult <chr> "False", "False", "False", "False", "False", "Fa…
## $ belongs_to_collection <chr> "{'id': 10194, 'name': 'Toy Story Collection', '…
## $ budget <chr> "30000000", "65000000", "0", "16000000", "0", "6…
## $ genres <chr> "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'n…
## $ homepage <chr> "http://toystory.disney.com/toy-story", "", "", …
## $ id <chr> "862", "8844", "15602", "31357", "11862", "949",…
## $ imdb_id <chr> "tt0114709", "tt0113497", "tt0113228", "tt011488…
## $ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", …
## $ original_title <chr> "Toy Story", "Jumanji", "Grumpier Old Men", "Wai…
## $ overview <chr> "Led by Woody, Andy's toys live happily in his r…
## $ popularity <chr> "21.946943", "17.015539", "11.7129", "3.859495",…
## $ poster_path <chr> "/rhIRbceoE9lR4veEXuwCC2wARtG.jpg", "/vzmL6fP7aP…
## $ production_companies <chr> "[{'name': 'Pixar Animation Studios', 'id': 3}]"…
## $ production_countries <chr> "[{'iso_3166_1': 'US', 'name': 'United States of…
## $ release_date <chr> "1995-10-30", "1995-12-15", "1995-12-22", "1995-…
## $ revenue <dbl> 373554033, 262797249, 0, 81452156, 76578911, 187…
## $ runtime <dbl> 81, 104, 101, 127, 106, 170, 127, 97, 106, 130, …
## $ spoken_languages <chr> "[{'iso_639_1': 'en', 'name': 'English'}]", "[{'…
## $ status <chr> "Released", "Released", "Released", "Released", …
## $ tagline <chr> "", "Roll the dice and unleash the excitement!",…
## $ title <chr> "Toy Story", "Jumanji", "Grumpier Old Men", "Wai…
## $ video <chr> "False", "False", "False", "False", "False", "Fa…
## $ vote_average <dbl> 7.7, 6.9, 6.5, 6.1, 5.7, 7.7, 6.2, 5.4, 5.5, 6.6…
## $ vote_count <int> 5415, 2413, 92, 34, 173, 1886, 141, 45, 174, 119…
# Per-column summary statistics of the raw data, before any cleaning.
dfmovies %>%
  summary()
## adult belongs_to_collection budget genres
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## homepage id imdb_id original_language
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## original_title overview popularity poster_path
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## production_companies production_countries release_date
## Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## revenue runtime spoken_languages status
## Min. :0.000e+00 Min. : 0.00 Length:45466 Length:45466
## 1st Qu.:0.000e+00 1st Qu.: 85.00 Class :character Class :character
## Median :0.000e+00 Median : 95.00 Mode :character Mode :character
## Mean :1.121e+07 Mean : 94.13
## 3rd Qu.:0.000e+00 3rd Qu.: 107.00
## Max. :2.788e+09 Max. :1256.00
## NA's :6 NA's :263
## tagline title video vote_average
## Length:45466 Length:45466 Length:45466 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 5.000
## Mode :character Mode :character Mode :character Median : 6.000
## Mean : 5.618
## 3rd Qu.: 6.800
## Max. :10.000
## NA's :6
## vote_count
## Min. : 0.0
## 1st Qu.: 3.0
## Median : 10.0
## Mean : 109.9
## 3rd Qu.: 34.0
## Max. :14075.0
## NA's :6
# Convert a money-like column to double without mangling its values.
#   x: a numeric, character or factor vector.
# Returns a double vector the same length as x. Numeric input is passed
# through as-is; text is converted only when the whole (trimmed) string is a
# plain non-negative number, and becomes NA otherwise.
parse_amount <- function(x) {
  if (is.numeric(x)) {
    # Never route numbers through text: as.character() writes round values in
    # scientific notation (100000 becomes "1e+05"), and stripping non-digits
    # from that would silently turn 100000 into 105.
    return(as.numeric(x))
  }
  txt <- trimws(as.character(x))
  # Stripping every non-digit would drop decimal points and glue the stray
  # digits of a malformed entry into a bogus amount, so such entries are
  # flagged as missing instead.
  is_plain_number <- grepl("^[0-9]+(\\.[0-9]+)?$", txt)
  out <- rep(NA_real_, length(txt))
  out[is_plain_number] <- as.numeric(txt[is_plain_number])
  out
}

# budget is read as text and revenue as numeric; bring both to double.
dfmovies <- dfmovies %>%
  mutate(
    budget = parse_amount(budget),
    revenue = parse_amount(revenue)
  )
# Number of budget entries that could not be read as a number.
sum(is.na(dfmovies$budget))
## [1] 0
# Number of missing revenue values.
sum(is.na(dfmovies[["revenue"]]))
## [1] 6
# Coerce the vote columns to their expected types (double average, integer
# count), then re-inspect the column summaries.
dfmovies <- mutate(
  dfmovies,
  vote_average = as.numeric(vote_average),
  vote_count = as.integer(vote_count)
)
dfmovies %>%
  summary()
## adult belongs_to_collection budget
## Length:45466 Length:45466 Min. : 0
## Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Median : 0
## Mean : 4225764
## 3rd Qu.: 0
## Max. :380000000
##
## genres homepage id imdb_id
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## original_language original_title overview popularity
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## poster_path production_companies production_countries
## Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## release_date revenue runtime spoken_languages
## Length:45466 Min. :0.000e+00 Min. : 0.00 Length:45466
## Class :character 1st Qu.:0.000e+00 1st Qu.: 85.00 Class :character
## Mode :character Median :0.000e+00 Median : 95.00 Mode :character
## Mean :1.064e+07 Mean : 94.13
## 3rd Qu.:0.000e+00 3rd Qu.: 107.00
## Max. :2.788e+09 Max. :1256.00
## NA's :6 NA's :263
## status tagline title video
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## vote_average vote_count
## Min. : 0.000 Min. : 0.0
## 1st Qu.: 5.000 1st Qu.: 3.0
## Median : 6.000 Median : 10.0
## Mean : 5.618 Mean : 109.9
## 3rd Qu.: 6.800 3rd Qu.: 34.0
## Max. :10.000 Max. :14075.0
## NA's :6 NA's :6
# vote_average and vote_count were already coerced to numeric/integer in the
# preceding step; the conversion is idempotent, so it is not repeated here and
# only the summary is shown.
summary(dfmovies)
## adult belongs_to_collection budget
## Length:45466 Length:45466 Min. : 0
## Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Median : 0
## Mean : 4225764
## 3rd Qu.: 0
## Max. :380000000
##
## genres homepage id imdb_id
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## original_language original_title overview popularity
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## poster_path production_companies production_countries
## Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## release_date revenue runtime spoken_languages
## Length:45466 Min. :0.000e+00 Min. : 0.00 Length:45466
## Class :character 1st Qu.:0.000e+00 1st Qu.: 85.00 Class :character
## Mode :character Median :0.000e+00 Median : 95.00 Mode :character
## Mean :1.064e+07 Mean : 94.13
## 3rd Qu.:0.000e+00 3rd Qu.: 107.00
## Max. :2.788e+09 Max. :1256.00
## NA's :6 NA's :263
## status tagline title video
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## vote_average vote_count
## Min. : 0.000 Min. : 0.0
## 1st Qu.: 5.000 1st Qu.: 3.0
## Median : 6.000 Median : 10.0
## Mean : 5.618 Mean : 109.9
## 3rd Qu.: 6.800 3rd Qu.: 34.0
## Max. :10.000 Max. :14075.0
## NA's :6 NA's :6
# Parse release_date from "YYYY-MM-DD" text into Date values; strings that do
# not match the format come back as NA. Then re-inspect the summaries.
dfmovies <- mutate(
  dfmovies,
  release_date = as.Date(release_date, format = "%Y-%m-%d")
)
dfmovies %>%
  summary()
## adult belongs_to_collection budget
## Length:45466 Length:45466 Min. : 0
## Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Median : 0
## Mean : 4225764
## 3rd Qu.: 0
## Max. :380000000
##
## genres homepage id imdb_id
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## original_language original_title overview popularity
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## poster_path production_companies production_countries
## Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## release_date revenue runtime spoken_languages
## Min. :1874-12-09 Min. :0.000e+00 Min. : 0.00 Length:45466
## 1st Qu.:1978-10-06 1st Qu.:0.000e+00 1st Qu.: 85.00 Class :character
## Median :2001-08-30 Median :0.000e+00 Median : 95.00 Mode :character
## Mean :1992-05-15 Mean :1.064e+07 Mean : 94.13
## 3rd Qu.:2010-12-17 3rd Qu.:0.000e+00 3rd Qu.: 107.00
## Max. :2020-12-16 Max. :2.788e+09 Max. :1256.00
## NA's :90 NA's :6 NA's :263
## status tagline title video
## Length:45466 Length:45466 Length:45466 Length:45466
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## vote_average vote_count
## Min. : 0.000 Min. : 0.0
## 1st Qu.: 5.000 1st Qu.: 3.0
## Median : 6.000 Median : 10.0
## Mean : 5.618 Mean : 109.9
## 3rd Qu.: 6.800 3rd Qu.: 34.0
## Max. :10.000 Max. :14075.0
## NA's :6 NA's :6
# Distribution of runtime before outlier handling.
summary(dfmovies[["runtime"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 85.00 95.00 94.13 107.00 1256.00 263
# Histogram of runtime in bins of width 10, to spot implausible values.
ggplot(data = dfmovies, mapping = aes(x = runtime)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black")
## Warning: Removed 263 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Treat runtimes above 300 as invalid by blanking them to NA (values that are
# already missing stay missing), then check the resulting distribution.
dfmovies <- mutate(
  dfmovies,
  runtime = ifelse(runtime > 300, NA_real_, runtime)
)
summary(dfmovies[["runtime"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 85.00 95.00 93.21 107.00 300.00 371
# Fill missing runtimes with the mean of the observed ones, then count rows
# that are exact duplicates of an earlier row.
dfmovies <- mutate(
  dfmovies,
  runtime = coalesce(runtime, mean(runtime, na.rm = TRUE))
)
dfmovies %>%
  duplicated() %>%
  sum()
## [1] 17
# Keep a single copy of each fully duplicated row and confirm none remain.
dfmovies <- dfmovies %>%
  distinct()
dfmovies %>%
  duplicated() %>%
  sum()
## [1] 0
# Range of release dates and how many are still missing.
summary(dfmovies[["release_date"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1874-12-09" "1978-10-06" "2001-08-30" "1992-05-16" "2010-12-17" "2020-12-16"
## NA's
## "90"
# Drop rows whose release_date is missing, then summarise what remains.
dfmovies <- filter(dfmovies, !is.na(release_date))
dfmovies %>%
  summary()
## adult belongs_to_collection budget
## Length:45359 Length:45359 Min. : 0
## Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Median : 0
## Mean : 4234169
## 3rd Qu.: 0
## Max. :380000000
## genres homepage id imdb_id
## Length:45359 Length:45359 Length:45359 Length:45359
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## original_language original_title overview popularity
## Length:45359 Length:45359 Length:45359 Length:45359
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## poster_path production_companies production_countries
## Length:45359 Length:45359 Length:45359
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## release_date revenue runtime spoken_languages
## Min. :1874-12-09 Min. :0.000e+00 Min. : 0.00 Length:45359
## 1st Qu.:1978-10-06 1st Qu.:0.000e+00 1st Qu.: 85.00 Class :character
## Median :2001-08-30 Median :0.000e+00 Median : 94.00 Mode :character
## Mean :1992-05-16 Mean :1.066e+07 Mean : 93.27
## 3rd Qu.:2010-12-17 3rd Qu.:0.000e+00 3rd Qu.:106.00
## Max. :2020-12-16 Max. :2.788e+09 Max. :300.00
## status tagline title video
## Length:45359 Length:45359 Length:45359 Length:45359
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## vote_average vote_count
## Min. : 0.000 Min. : 0.0
## 1st Qu.: 5.000 1st Qu.: 3.0
## Median : 6.000 Median : 10.0
## Mean : 5.624 Mean : 110.1
## 3rd Qu.: 6.800 3rd Qu.: 34.0
## Max. :10.000 Max. :14075.0