read in the data downloaded from Kaggle
movie_data<-read.csv("~/Dropbox/movie_metadata.csv", header = TRUE, stringsAsFactors = FALSE)
Inspect the data
head(movie_data, 3)
inspect data structure
str(movie_data)
'data.frame': 5043 obs. of 28 variables:
$ color : chr "Color" "Color" "Color" "Color" ...
$ director_name : chr "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
$ num_critic_for_reviews : int 723 302 602 813 NA 462 392 324 635 375 ...
$ duration : int 178 169 148 164 NA 132 156 100 141 153 ...
$ director_facebook_likes : int 0 563 0 22000 131 475 0 15 0 282 ...
$ actor_3_facebook_likes : int 855 1000 161 23000 NA 530 4000 284 19000 10000 ...
$ actor_2_name : chr "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
$ actor_1_facebook_likes : int 1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
$ gross : int 760505847 309404152 200074175 448130642 NA 73058679 336530303 200807262 458991599 301956980 ...
$ genres : chr "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
$ actor_1_name : chr "CCH Pounder" "Johnny Depp" "Christoph Waltz" "Tom Hardy" ...
$ movie_title : chr "Avatar " "Pirates of the Caribbean: At World's End " "Spectre " "The Dark Knight Rises " ...
$ num_voted_users : int 886204 471220 275868 1144337 8 212204 383056 294810 462669 321795 ...
$ cast_total_facebook_likes: int 4834 48350 11700 106759 143 1873 46055 2036 92000 58753 ...
$ actor_3_name : chr "Wes Studi" "Jack Davenport" "Stephanie Sigman" "Joseph Gordon-Levitt" ...
$ facenumber_in_poster : int 0 0 1 0 0 1 0 1 4 3 ...
$ plot_keywords : chr "avatar|future|marine|native|paraplegic" "goddess|marriage ceremony|marriage proposal|pirate|singapore" "bomb|espionage|sequel|spy|terrorist" "deception|imprisonment|lawlessness|police officer|terrorist plot" ...
$ movie_imdb_link : chr "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1" ...
$ num_user_for_reviews : int 3054 1238 994 2701 NA 738 1902 387 1117 973 ...
$ language : chr "English" "English" "English" "English" ...
$ country : chr "USA" "USA" "UK" "USA" ...
$ content_rating : chr "PG-13" "PG-13" "PG-13" "PG-13" ...
$ budget : num 2.37e+08 3.00e+08 2.45e+08 2.50e+08 NA ...
$ title_year : int 2009 2007 2015 2012 NA 2012 2007 2010 2015 2009 ...
$ actor_2_facebook_likes : int 936 5000 393 23000 12 632 11000 553 21000 11000 ...
$ imdb_score : num 7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
$ aspect_ratio : num 1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
$ movie_facebook_likes : int 33000 0 85000 164000 0 24000 0 29000 118000 10000 ...
subset variables needed for data analysis
movies<-movie_data[c(2,9,12,23,24,26)]
calculate profit for each movie
profit<-movies$gross-movies$budget
check results
summary(profit)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-1.221e+10 -1.027e+07 8.516e+05 5.845e+06 2.475e+07 5.235e+08 1152
add profit column to dataset
movies$profit=profit
check data frame with new profit column
head(movies, 3)
install sql library if needed and/or load the sql library will use sqldf package to manipulate dataframe
#install.packages("sqldf")
library("sqldf")
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
select the top 5 movies that made the most profit
top5Profit<-sqldf('SELECT movie_title, director_name, title_year, gross, budget, profit FROM movies ORDER BY profit DESC LIMIT 5')
Loading required package: tcltk
Quoted identifiers should have class SQL, use DBI::SQL() if the caller performs the quoting.
top5Profit
select the top 5 highest grossing movies
top5Gross<-sqldf('SELECT movie_title, director_name, title_year, gross, budget, profit FROM movies ORDER BY gross DESC LIMIT 5')
top5Gross
select the top 5 highest rated movies according to imdb
highestRated<-sqldf('SELECT movie_title, director_name, title_year, imdb_score, gross, budget, profit FROM movies ORDER BY imdb_score DESC LIMIT 5')
select the bottom 5 lowest rated movies according to imdb
lowestRated<-sqldf('SELECT movie_title, director_name, title_year, imdb_score, gross, budget, profit FROM movies ORDER BY imdb_score ASC LIMIT 5')
lowestRated
A scatterplot visualization of a movie’s rating and profit
plot(movies$imdb_score,movies$profit, main="Scatterplot of IMDB rating and Profit")
top grossing Directors
topDirectors<-sqldf('SELECT director_name, SUM(profit) as movie_profit, COUNT(movie_title) AS totalMovies, AVG(imdb_score) AS average_movie_rating FROM movies GROUP BY director_name ORDER BY movie_profit DESC LIMIT 10')
topDirectors
load visualization library
library(ggplot2)
plot top directors by sum of total movie profits
ggplot(topDirectors,
aes(y = topDirectors$movie_profit, x =
topDirectors$totalMovies)) +
geom_point(colour="white", size=2)+
scale_shape_manual(values=4)+
geom_text(aes(label=topDirectors$director_name), size=2) +
ggtitle("Top 10 profit making movies by director")+
xlab("Total number of movies directed")+
ylab("Total movie profits")
plot top directors by average imdb movie ratings
ggplot(topDirectors,
aes(y = topDirectors$movie_profit, x =
topDirectors$average_movie_rating)) +
geom_point(colour="white", size=2)+
geom_text(aes(label=topDirectors$director_name), size=2) +
ggtitle("Top money-making directors and their average IMDB movie ratings")+
xlab("Average IMDB ratings")+
ylab("Total movie profits")