Dataset

https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset

read in the data downloaded from Kaggle

movie_data<-read.csv("~/Dropbox/movie_metadata.csv", header = TRUE, stringsAsFactors = FALSE)

Inspect the data

head(movie_data, 3)

inspect data structure

str(movie_data)
'data.frame':   5043 obs. of  28 variables:
 $ color                    : chr  "Color" "Color" "Color" "Color" ...
 $ director_name            : chr  "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
 $ num_critic_for_reviews   : int  723 302 602 813 NA 462 392 324 635 375 ...
 $ duration                 : int  178 169 148 164 NA 132 156 100 141 153 ...
 $ director_facebook_likes  : int  0 563 0 22000 131 475 0 15 0 282 ...
 $ actor_3_facebook_likes   : int  855 1000 161 23000 NA 530 4000 284 19000 10000 ...
 $ actor_2_name             : chr  "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
 $ actor_1_facebook_likes   : int  1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
 $ gross                    : int  760505847 309404152 200074175 448130642 NA 73058679 336530303 200807262 458991599 301956980 ...
 $ genres                   : chr  "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
 $ actor_1_name             : chr  "CCH Pounder" "Johnny Depp" "Christoph Waltz" "Tom Hardy" ...
 $ movie_title              : chr  "Avatar " "Pirates of the Caribbean: At World's End " "Spectre " "The Dark Knight Rises " ...
 $ num_voted_users          : int  886204 471220 275868 1144337 8 212204 383056 294810 462669 321795 ...
 $ cast_total_facebook_likes: int  4834 48350 11700 106759 143 1873 46055 2036 92000 58753 ...
 $ actor_3_name             : chr  "Wes Studi" "Jack Davenport" "Stephanie Sigman" "Joseph Gordon-Levitt" ...
 $ facenumber_in_poster     : int  0 0 1 0 0 1 0 1 4 3 ...
 $ plot_keywords            : chr  "avatar|future|marine|native|paraplegic" "goddess|marriage ceremony|marriage proposal|pirate|singapore" "bomb|espionage|sequel|spy|terrorist" "deception|imprisonment|lawlessness|police officer|terrorist plot" ...
 $ movie_imdb_link          : chr  "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1" ...
 $ num_user_for_reviews     : int  3054 1238 994 2701 NA 738 1902 387 1117 973 ...
 $ language                 : chr  "English" "English" "English" "English" ...
 $ country                  : chr  "USA" "USA" "UK" "USA" ...
 $ content_rating           : chr  "PG-13" "PG-13" "PG-13" "PG-13" ...
 $ budget                   : num  2.37e+08 3.00e+08 2.45e+08 2.50e+08 NA ...
 $ title_year               : int  2009 2007 2015 2012 NA 2012 2007 2010 2015 2009 ...
 $ actor_2_facebook_likes   : int  936 5000 393 23000 12 632 11000 553 21000 11000 ...
 $ imdb_score               : num  7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
 $ aspect_ratio             : num  1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
 $ movie_facebook_likes     : int  33000 0 85000 164000 0 24000 0 29000 118000 10000 ...

subset variables needed for data analysis

movies<-movie_data[c(2,9,12,23,24,26)]

calculate profit for each movie

profit<-movies$gross-movies$budget

check results

summary(profit)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.       NA's 
-1.221e+10 -1.027e+07  8.516e+05  5.845e+06  2.475e+07  5.235e+08       1152 

add profit column to dataset

movies$profit=profit

check data frame with new profit column

head(movies, 3)

install sql library if needed and/or load the sql library will use sqldf package to manipulate dataframe

#install.packages("sqldf")
library("sqldf")
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite

select the top 5 movies that made the most profit

top5Profit<-sqldf('SELECT movie_title, director_name, title_year, gross, budget, profit FROM movies ORDER BY profit DESC LIMIT 5')
Loading required package: tcltk
Quoted identifiers should have class SQL, use DBI::SQL() if the caller performs the quoting.
top5Profit

select the top 5 highest grossing movies

top5Gross<-sqldf('SELECT movie_title, director_name, title_year, gross, budget, profit FROM movies ORDER BY gross DESC LIMIT 5')
top5Gross

select the top 5 highest rated movies according to imdb

highestRated<-sqldf('SELECT movie_title, director_name, title_year, imdb_score, gross, budget, profit FROM movies ORDER BY imdb_score DESC LIMIT 5')

select the bottom 5 lowest rated movies according to imdb

lowestRated<-sqldf('SELECT movie_title, director_name, title_year, imdb_score, gross, budget, profit FROM movies ORDER BY imdb_score ASC LIMIT 5')
lowestRated

A scatterplot visualization of a movie’s rating and profit

plot(movies$imdb_score,movies$profit, main="Scatterplot of IMDB rating and Profit")

top grossing Directors

topDirectors<-sqldf('SELECT director_name, SUM(profit) as movie_profit, COUNT(movie_title) AS totalMovies, AVG(imdb_score) AS average_movie_rating FROM movies GROUP BY director_name ORDER BY movie_profit DESC LIMIT 10')
topDirectors

load visualization library

library(ggplot2)

plot top directors by sum of total movie profits

ggplot(topDirectors, 
       aes(y = topDirectors$movie_profit, x =
            topDirectors$totalMovies)) +
            geom_point(colour="white", size=2)+
            scale_shape_manual(values=4)+
             geom_text(aes(label=topDirectors$director_name), size=2) +
          ggtitle("Top 10 profit making movies by director")+
            xlab("Total number of movies directed")+
            ylab("Total movie profits")

plot top directors by average imdb movie ratings

ggplot(topDirectors, 
       aes(y = topDirectors$movie_profit, x =
            topDirectors$average_movie_rating)) +
            geom_point(colour="white", size=2)+
             geom_text(aes(label=topDirectors$director_name), size=2) +
  ggtitle("Top money-making directors and their average IMDB movie ratings")+
            xlab("Average IMDB ratings")+
            ylab("Total movie profits")

LS0tCnRpdGxlOiAiTW9uZXktbWFraW5nIG1vdmllIGRpcmVjdG9ycyBhbmQgdGhlaXIgbW92aWVzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCiNEYXRhc2V0CiNodHRwczovL3d3dy5rYWdnbGUuY29tL2RlZXBtYXRyaXgvaW1kYi01MDAwLW1vdmllLWRhdGFzZXQKCnJlYWQgaW4gdGhlIGRhdGEKZG93bmxvYWRlZCBmcm9tIEthZ2dsZQpgYGB7cn0KbW92aWVfZGF0YTwtcmVhZC5jc3YoIn4vRHJvcGJveC9tb3ZpZV9tZXRhZGF0YS5jc3YiLCBoZWFkZXIgPSBUUlVFLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpCmBgYApJbnNwZWN0IHRoZSBkYXRhCmBgYHtyfQpoZWFkKG1vdmllX2RhdGEsIDMpCmBgYAppbnNwZWN0IGRhdGEgc3RydWN0dXJlCmBgYHtyfQpzdHIobW92aWVfZGF0YSkKYGBgCnN1YnNldCB2YXJpYWJsZXMgbmVlZGVkIGZvciBkYXRhIGFuYWx5c2lzCmBgYHtyfQptb3ZpZXM8LW1vdmllX2RhdGFbYygyLDksMTIsMjMsMjQsMjYpXQpgYGAKY2FsY3VsYXRlIHByb2ZpdCBmb3IgZWFjaCBtb3ZpZQpgYGB7cn0KcHJvZml0PC1tb3ZpZXMkZ3Jvc3MtbW92aWVzJGJ1ZGdldApgYGAKY2hlY2sgcmVzdWx0cwpgYGB7cn0Kc3VtbWFyeShwcm9maXQpCmBgYAphZGQgcHJvZml0IGNvbHVtbiB0byBkYXRhc2V0CmBgYHtyfQptb3ZpZXMkcHJvZml0PXByb2ZpdApgYGAKY2hlY2sgZGF0YSBmcmFtZSB3aXRoIG5ldyBwcm9maXQgY29sdW1uCmBgYHtyfQpoZWFkKG1vdmllcywgMykKYGBgCmluc3RhbGwgc3FsIGxpYnJhcnkgaWYgbmVlZGVkIGFuZC9vciBsb2FkIHRoZSBzcWwgbGlicmFyeQp3aWxsIHVzZSBzcWxkZiBwYWNrYWdlIHRvIG1hbmlwdWxhdGUgZGF0YWZyYW1lCmBgYHtyfQojaW5zdGFsbC5wYWNrYWdlcygic3FsZGYiKQpsaWJyYXJ5KCJzcWxkZiIpCmBgYApzZWxlY3QgdGhlIHRvcCA1IG1vdmllcyB0aGF0IG1hZGUgdGhlIG1vc3QgcHJvZml0CmBgYHtyfQp0b3A1UHJvZml0PC1zcWxkZignU0VMRUNUIG1vdmllX3RpdGxlLCBkaXJlY3Rvcl9uYW1lLCB0aXRsZV95ZWFyLCBncm9zcywgYnVkZ2V0LCBwcm9maXQgRlJPTSBtb3ZpZXMgT1JERVIgQlkgcHJvZml0IERFU0MgTElNSVQgNScpCgp0b3A1UHJvZml0CmBgYApzZWxlY3QgdGhlIHRvcCA1IGhpZ2hlc3QgZ3Jvc3NpbmcgbW92aWVzCmBgYHtyfQp0b3A1R3Jvc3M8LXNxbGRmKCdTRUxFQ1QgbW92aWVfdGl0bGUsIGRpcmVjdG9yX25hbWUsIHRpdGxlX3llYXIsIGdyb3NzLCBidWRnZXQsIHByb2ZpdCBGUk9NIG1vdmllcyBPUkRFUiBCWSBncm9zcyBERVNDIExJTUlUIDUnKQoKdG9wNUdyb3NzCmBgYApzZWxlY3QgdGhlIHRvcCA1IGhpZ2hlc3QgcmF0ZWQgbW92aWVzIGFjY29yZGluZyB0byBpbWRiCmBgYHtyfQpoaWdoZXN0UmF0ZWQ8LXNxbGRmKCdTRUxFQ1QgbW92aWVfdGl0bGUsIGRpcmVjdG9yX25hbWUsIHRpdGxlX3llYXIsIGltZGJfc2NvcmUsIGdyb3NzLCBidWRnZXQsIHByb2ZpdCBGUk9NIG1vdmllcyBPUkRFUiBCWSBpbWRiX3Njb3JlIERFU0MgTElNSVQgNScpCmBgYApzZWxlY3QgdGhlIGJvdHRvbSA1IGxvd2VzdCByYXRlZCBtb3ZpZXMgYWNjb3JkaW5nIHRvIGltZGIKYGBge3J9Cmxvd2VzdFJhdGVkPC1zcWxkZignU0VMRUNUIG1vdmllX3RpdGxlLCBkaXJlY3Rvcl9uYW1lLCB0aXRsZV95ZWFyLCBpbWRiX3Njb3JlLCBncm9zcywgYnVkZ2V0LCBwcm9maXQgRlJPTSBtb3ZpZXMgT1JERVIgQlkgaW1kYl9zY29yZSBBU0MgTElNSVQgNScpCgpsb3dlc3RSYXRlZApgYGAKQSBzY2F0dGVycGxvdCB2aXN1YWxpemF0aW9uIG9mIGEgbW92aWUncyByYXRpbmcgYW5kIHByb2ZpdAoKYGBge3J9CnBsb3QobW92aWVzJGltZGJfc2NvcmUsbW92aWVzJHByb2ZpdCwgbWFpbj0iU2NhdHRlcnBsb3Qgb2YgSU1EQiByYXRpbmcgYW5kIFByb2ZpdCIpCmBgYAp0b3AgZ3Jvc3NpbmcgRGlyZWN0b3JzCmBgYHtyfQp0b3BEaXJlY3RvcnM8LXNxbGRmKCdTRUxFQ1QgZGlyZWN0b3JfbmFtZSwgU1VNKHByb2ZpdCkgYXMgbW92aWVfcHJvZml0LCBDT1VOVChtb3ZpZV90aXRsZSkgQVMgdG90YWxNb3ZpZXMsIEFWRyhpbWRiX3Njb3JlKSBBUyBhdmVyYWdlX21vdmllX3JhdGluZyBGUk9NIG1vdmllcyBHUk9VUCBCWSBkaXJlY3Rvcl9uYW1lIE9SREVSIEJZIG1vdmllX3Byb2ZpdCBERVNDIExJTUlUIDEwJykKdG9wRGlyZWN0b3JzCmBgYApsb2FkIHZpc3VhbGl6YXRpb24gbGlicmFyeQpgYGB7cn0KbGlicmFyeShnZ3Bsb3QyKQpgYGAKcGxvdCB0b3AgZGlyZWN0b3JzIGJ5IHN1bSBvZiB0b3RhbCBtb3ZpZSBwcm9maXRzCmBgYHtyfQpnZ3Bsb3QodG9wRGlyZWN0b3JzLCAKICAgICAgIGFlcyh5ID0gdG9wRGlyZWN0b3JzJG1vdmllX3Byb2ZpdCwgeCA9CiAgICAgICAgICAgIHRvcERpcmVjdG9ycyR0b3RhbE1vdmllcykpICsKICAgICAgICAgICAgZ2VvbV9wb2ludChjb2xvdXI9IndoaXRlIiwgc2l6ZT0yKSsKICAgICAgICAgICAgc2NhbGVfc2hhcGVfbWFudWFsKHZhbHVlcz00KSsKICAgICAgICAgICAgIGdlb21fdGV4dChhZXMobGFiZWw9dG9wRGlyZWN0b3JzJGRpcmVjdG9yX25hbWUpLCBzaXplPTIpICsKICAgICAgICAgIGdndGl0bGUoIlRvcCAxMCBwcm9maXQgbWFraW5nIG1vdmllcyBieSBkaXJlY3RvciIpKwogICAgICAgICAgICB4bGFiKCJUb3RhbCBudW1iZXIgb2YgbW92aWVzIGRpcmVjdGVkIikrCiAgICAgICAgICAgIHlsYWIoIlRvdGFsIG1vdmllIHByb2ZpdHMiKQpgYGAKcGxvdCB0b3AgZGlyZWN0b3JzIGJ5IGF2ZXJhZ2UgaW1kYiBtb3ZpZSByYXRpbmdzCmBgYHtyfQpnZ3Bsb3QodG9wRGlyZWN0b3JzLCAKICAgICAgIGFlcyh5ID0gdG9wRGlyZWN0b3JzJG1vdmllX3Byb2ZpdCwgeCA9CiAgICAgICAgICAgIHRvcERpcmVjdG9ycyRhdmVyYWdlX21vdmllX3JhdGluZykpICsKICAgICAgICAgICAgZ2VvbV9wb2ludChjb2xvdXI9IndoaXRlIiwgc2l6ZT0yKSsKICAgICAgICAgICAgIGdlb21fdGV4dChhZXMobGFiZWw9dG9wRGlyZWN0b3JzJGRpcmVjdG9yX25hbWUpLCBzaXplPTIpICsKICBnZ3RpdGxlKCJUb3AgbW9uZXktbWFraW5nIGRpcmVjdG9ycyBhbmQgdGhlaXIgYXZlcmFnZSBJTURCIG1vdmllIHJhdGluZ3MiKSsKICAgICAgICAgICAgeGxhYigiQXZlcmFnZSBJTURCIHJhdGluZ3MiKSsKICAgICAgICAgICAgeWxhYigiVG90YWwgbW92aWUgcHJvZml0cyIpCgpgYGAKCg==