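Since I am a fan of movies, I am going to analyze the movies data set. I will perform the following steps: load the data, inspect it, remove incomplete rows, extract the Pirates of the Caribbean films, reshape the result, and plot it.
My focus will be to analyze the gross and budget of the Pirates of the Caribbean films in the data set.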
library(readr)
library(tidyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v dplyr 1.0.4
## v tibble 3.0.6 v stringr 1.4.0
## v purrr 0.3.4 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(dplyr)
library(ggplot2)
# Load the movie data set from GitHub. The file appears to be UTF-8 encoded:
# without marking the strings as UTF-8, each movie title prints with a stray
# trailing "Â" (a mis-decoded non-breaking space) on non-UTF-8 locales.
movies_data <- read.csv(
  "https://raw.githubusercontent.com/nathtrish334/Data-607/main/movie_dataset.csv",
  header = TRUE,
  sep = ",",
  encoding = "UTF-8"
)
# Strip leading/trailing whitespace from the titles. The "[\\h\\v]" class also
# covers Unicode whitespace such as the non-breaking space, which the default
# trimws() character class does not match.
movies_data$movie_title <- trimws(
  movies_data$movie_title,
  whitespace = "[\\h\\v]"
)
# Preview the first six rows of the first 20 columns
head(movies_data[1:20])
## color director_name num_critic_for_reviews duration
## 1 Color James Cameron 723 178
## 2 Color Gore Verbinski 302 169
## 3 Color Sam Mendes 602 148
## 4 Color Christopher Nolan 813 164
## 5 Doug Walker NA NA
## 6 Color Andrew Stanton 462 132
## director_facebook_likes actor_3_facebook_likes actor_2_name
## 1 0 855 Joel David Moore
## 2 563 1000 Orlando Bloom
## 3 0 161 Rory Kinnear
## 4 22000 23000 Christian Bale
## 5 131 NA Rob Walker
## 6 475 530 Samantha Morton
## actor_1_facebook_likes gross genres
## 1 1000 760505847 Action|Adventure|Fantasy|Sci-Fi
## 2 40000 309404152 Action|Adventure|Fantasy
## 3 11000 200074175 Action|Adventure|Thriller
## 4 27000 448130642 Action|Thriller
## 5 131 NA Documentary
## 6 640 73058679 Action|Adventure|Sci-Fi
## actor_1_name movie_title
## 1 CCH Pounder Avatar
## 2 Johnny Depp Pirates of the Caribbean: At World's End
## 3 Christoph Waltz Spectre
## 4 Tom Hardy The Dark Knight Rises
## 5 Doug Walker Star Wars: Episode VII - The Force Awakens
## 6 Daryl Sabara John Carter
## num_voted_users cast_total_facebook_likes actor_3_name
## 1 886204 4834 Wes Studi
## 2 471220 48350 Jack Davenport
## 3 275868 11700 Stephanie Sigman
## 4 1144337 106759 Joseph Gordon-Levitt
## 5 8 143
## 6 212204 1873 Polly Walker
## facenumber_in_poster
## 1 0
## 2 0
## 3 1
## 4 0
## 5 0
## 6 1
## plot_keywords
## 1 avatar|future|marine|native|paraplegic
## 2 goddess|marriage ceremony|marriage proposal|pirate|singapore
## 3 bomb|espionage|sequel|spy|terrorist
## 4 deception|imprisonment|lawlessness|police officer|terrorist plot
## 5
## 6 alien|american civil war|male nipple|mars|princess
## movie_imdb_link num_user_for_reviews
## 1 http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1 3054
## 2 http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1 1238
## 3 http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1 994
## 4 http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1 2701
## 5 http://www.imdb.com/title/tt5289954/?ref_=fn_tt_tt_1 NA
## 6 http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1 738
## language
## 1 English
## 2 English
## 3 English
## 4 English
## 5
## 6 English
# Total number of records (rows) in the raw data set
dim(movies_data)[1L]
## [1] 5043
# List the column names of the raw data set
colnames(movies_data)
## [1] "color" "director_name"
## [3] "num_critic_for_reviews" "duration"
## [5] "director_facebook_likes" "actor_3_facebook_likes"
## [7] "actor_2_name" "actor_1_facebook_likes"
## [9] "gross" "genres"
## [11] "actor_1_name" "movie_title"
## [13] "num_voted_users" "cast_total_facebook_likes"
## [15] "actor_3_name" "facenumber_in_poster"
## [17] "plot_keywords" "movie_imdb_link"
## [19] "num_user_for_reviews" "language"
## [21] "country" "content_rating"
## [23] "budget" "title_year"
## [25] "actor_2_facebook_likes" "imdb_score"
## [27] "aspect_ratio" "movie_facebook_likes"
# Per column, count the entries that are either NA or an empty string
colSums(is.na(movies_data) | movies_data == "")
## color director_name num_critic_for_reviews
## 19 104 50
## duration director_facebook_likes actor_3_facebook_likes
## 15 104 23
## actor_2_name actor_1_facebook_likes gross
## 13 7 884
## genres actor_1_name movie_title
## 0 7 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 23
## facenumber_in_poster plot_keywords movie_imdb_link
## 13 153 0
## num_user_for_reviews language country
## 21 12 5
## content_rating budget title_year
## 303 492 108
## actor_2_facebook_likes imdb_score aspect_ratio
## 13 0 329
## movie_facebook_likes
## 0
The gross column has the highest number of NA and/or blank entries: 884.
Removing every row that contains an NA in any column (blank strings are not NA, so na.omit keeps them)
# Keep only the rows that have no NA in any column
movies_data_clean <- stats::na.omit(movies_data)
# Number of rows that survive the NA filter
dim(movies_data_clean)[1L]
## [1] 3801
Filter the rows for the Pirates of the Caribbean films
# Flag the rows whose title contains the franchise prefix, then keep only those
is_pirates_film <- grepl(
  "Pirates of the Caribbean:",
  movies_data_clean$movie_title
)
pirates_data <- movies_data_clean[is_pirates_film, ]
head(pirates_data)
## color director_name num_critic_for_reviews duration
## 2 Color Gore Verbinski 302 169
## 14 Color Gore Verbinski 313 151
## 19 Color Rob Marshall 448 136
## 206 Color Gore Verbinski 271 143
## director_facebook_likes actor_3_facebook_likes actor_2_name
## 2 563 1000 Orlando Bloom
## 14 563 1000 Orlando Bloom
## 19 252 1000 Sam Claflin
## 206 563 1000 Orlando Bloom
## actor_1_facebook_likes gross genres actor_1_name
## 2 40000 309404152 Action|Adventure|Fantasy Johnny Depp
## 14 40000 423032628 Action|Adventure|Fantasy Johnny Depp
## 19 40000 241063875 Action|Adventure|Fantasy Johnny Depp
## 206 40000 305388685 Action|Adventure|Fantasy Johnny Depp
## movie_title num_voted_users
## 2 Pirates of the Caribbean: At World's End 471220
## 14 Pirates of the Caribbean: Dead Man's Chest 522040
## 19 Pirates of the Caribbean: On Stranger Tides 370704
## 206 Pirates of the Caribbean: The Curse of the Black Pearl 809474
## cast_total_facebook_likes actor_3_name facenumber_in_poster
## 2 48350 Jack Davenport 0
## 14 48486 Jack Davenport 2
## 19 54083 Stephen Graham 4
## 206 48184 Jack Davenport 3
## plot_keywords
## 2 goddess|marriage ceremony|marriage proposal|pirate|singapore
## 14 box office hit|giant squid|heart|liar's dice|monster
## 19 blackbeard|captain|pirate|revenge|soldier
## 206 caribbean|curse|governor|pirate|undead
## movie_imdb_link num_user_for_reviews
## 2 http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1 1238
## 14 http://www.imdb.com/title/tt0383574/?ref_=fn_tt_tt_1 1832
## 19 http://www.imdb.com/title/tt1298650/?ref_=fn_tt_tt_1 484
## 206 http://www.imdb.com/title/tt0325980/?ref_=fn_tt_tt_1 2113
## language country content_rating budget title_year actor_2_facebook_likes
## 2 English USA PG-13 3.00e+08 2007 5000
## 14 English USA PG-13 2.25e+08 2006 5000
## 19 English USA PG-13 2.50e+08 2011 11000
## 206 English USA PG-13 1.40e+08 2003 5000
## imdb_score aspect_ratio movie_facebook_likes
## 2 7.1 2.35 0
## 14 7.3 2.35 5000
## 19 6.7 2.35 58000
## 206 8.1 2.35 10000
Extract movie title, budget, and gross
# Keep only the three columns needed for the budget-vs-gross comparison
pirates_data_cleaned <- pirates_data[, c("movie_title", "budget", "gross")]
head(pirates_data_cleaned)
## movie_title budget gross
## 2 Pirates of the Caribbean: At World's End 3.00e+08 309404152
## 14 Pirates of the Caribbean: Dead Man's Chest 2.25e+08 423032628
## 19 Pirates of the Caribbean: On Stranger Tides 2.50e+08 241063875
## 206 Pirates of the Caribbean: The Curse of the Black Pearl 1.40e+08 305388685
# Make sure the subset is a plain data frame before it is reshaped
pirates_data_cleaned <- as.data.frame(pirates_data_cleaned)
Melt this subset into long format using the reshape2 library
# Reshape to long format: one row per (movie_title, variable) pair, where
# `variable` is "budget" or "gross" and `value` holds the dollar amount.
# The argument is spelled out as `id.vars` rather than relying on partial
# matching of `id`.
pirates_melted <- melt(pirates_data_cleaned, id.vars = "movie_title")
head(pirates_melted)
## movie_title variable value
## 1 Pirates of the Caribbean: At World's End budget 300000000
## 2 Pirates of the Caribbean: Dead Man's Chest budget 225000000
## 3 Pirates of the Caribbean: On Stranger Tides budget 250000000
## 4 Pirates of the Caribbean: The Curse of the Black Pearl budget 140000000
## 5 Pirates of the Caribbean: At World's End gross 309404152
## 6 Pirates of the Caribbean: Dead Man's Chest gross 423032628
Stacked bar plot of gross vs. budget for each of the Pirates of the Caribbean films
# Stacked bars: one bar per film, with the budget and gross segments stacked
# and coloured by `variable`. Film titles are rotated 90 degrees on the x axis.
ggplot(pirates_melted, aes(x = movie_title, y = value, fill = variable)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Pirates Film", y = "Amount in Dollars")
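For three of the four Pirates films, the gross was larger than the budget. The exception is On Stranger Tides, whose gross (241,063,875) fell short of its budget (250,000,000).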