36-721 Homework 2

Jerry Zhang

Due Thursday, September 12, 2013, 3:00 PM

library(ggplot2)
library(GGally)
## Loading required package: reshape Loading required package: plyr
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:plyr':
## 
## rename, round_any
library(gridExtra)
## Loading required package: grid
# Read the IMDB movie sample and replace the CSV header with short column
# names in a single step.
moviedata <- setNames(
    read.csv("imdb-movies.csv"),
    c("X", "title", "year", "length", "rating", "votes", "mpaa", "genre",
        "budgetrange")
)
# Put the columns on the search path so they can be referenced by bare name.
attach(moviedata)

Exploratory Data Analysis

1) Summary statistics for all variables within the data frame.

# Print summary statistics for every column of the data frame.
summary(moviedata)
##        X                             title          year     
##  Min.   :  145   102 Dalmatians         :  1   Min.   :1990  
##  1st Qu.:13403   13th Warrior, The      :  1   1st Qu.:1996  
##  Median :27852   2 Fast 2 Furious       :  1   Median :2000  
##  Mean   :28137   2001: A Space Travesty :  1   Mean   :1999  
##  3rd Qu.:43814   3000 Miles to Graceland:  1   3rd Qu.:2002  
##  Max.   :58787   51st State, The        :  1   Max.   :2004  
##                  (Other)                :597                 
##      length        rating         votes           mpaa    
##  Min.   : 70   Min.   :2.30   Min.   :  1020   PG   : 69  
##  1st Qu.: 92   1st Qu.:5.20   1st Qu.:  3414   PG-13:207  
##  Median :100   Median :5.90   Median :  7728   R    :216  
##  Mean   :106   Mean   :5.84   Mean   : 12923   UR   :111  
##  3rd Qu.:116   3rd Qu.:6.60   3rd Qu.: 16108              
##  Max.   :251   Max.   :9.00   Max.   :157608              
##                                                           
##           genre        budgetrange 
##  Action      :163   BigBudget:340  
##  ActionComedy: 80   Modest   :257  
##  ActionDrama :114   Tiny     :  6  
##  Comedy      :246                  
##                                    
##                                    
## 

2) Pairwise scatterplot and correlations for the numerical variables year, length, rating and votes.

# Scatterplot matrix of the four numeric columns, selected by name.
ggpairs(
    moviedata[, c("year", "length", "rating", "votes")],
    title = "Pairwise Scatterplot And Correlations For Numerical Variables",
    legends = FALSE
)

plot of chunk unnamed-chunk-3

3) Boxplot of movie ratings by budget.

# Store the budget category as an ordered factor running from smallest to
# largest, so the boxes are drawn in that order along the x axis.
moviedata$budgetrange <- ordered(moviedata$budgetrange,
    levels = c("Tiny", "Modest", "BigBudget"))

# One box of IMDB ratings per budget category.
plot1 <- ggplot(moviedata, aes(x = budgetrange, y = rating))
plot1 +
    geom_boxplot() +
    ggtitle("Boxplot of Movie Ratings by Budget") +
    xlab("Budget ($)") +
    ylab("IMDB Ratings") +
    theme_bw()

plot of chunk unnamed-chunk-4

Determining Relationships

1) length and genre: Action dramas were, on average, the longest films, followed by action films and then action comedies. Comedies were the shortest type of film. The analysis could be strengthened if there were a standalone drama category.

# Movie length by genre: one box per genre, with the individual films drawn on
# top of the boxes and coloured by genre.
plot2 <- ggplot(moviedata, aes(x = genre, y = length))
plot2 +
    geom_boxplot() +
    geom_point(aes(colour = genre)) +
    ggtitle("Boxplot of Movie Length by Genre") +
    ylab("Length (min)") +
    theme_bw()

plot of chunk unnamed-chunk-5

2) length and rating: On average, longer movies tend to have higher ratings. Areas of greatest density are indicated by the 2D contours. Within this dataset, there are 6 movies longer than 3 hours: Bound by Honor, Gods and Generals, Heat, and the 3 Lord of the Rings movies. All 6 are big-budget action movies with fairly strong reviews; only Gods and Generals failed to achieve a 7.4 IMDB rating. Neither rating nor length is categorical. As mentioned above, adding a drama genre would help the analysis. The individual scatterplot for each genre is also presented.

# Top panel: rating against length for every film, with 2D density contours.
# Films running longer than 180 minutes are labelled with their length.
# The point layer is added once only; a second identical layer would just
# overdraw the first.
plot2 <- ggplot(moviedata, aes(x = length, y = rating)) +
    geom_point() +
    geom_density2d() +
    geom_text(data = subset(moviedata, length > 180),
        aes(x = length, y = rating, label = length)) +
    ggtitle("Scatterplot of Ratings by Length") +
    xlab("Length (min)") +
    ylab("IMDB Ratings") +
    theme_bw()

# Build the small rating-versus-length panel for a single genre: a filled 2D
# density with the films overlaid as translucent red points. The legend and
# axis titles are removed and the right/bottom margins pulled in so that four
# panels fit side by side.
#   genre_name:  value of moviedata$genre whose films are plotted
#   panel_title: title displayed above the panel
# Returns the ggplot object for the panel.
genre_density_panel <- function(genre_name, panel_title) {
    # which() drops any NA comparisons, so only matching rows are kept.
    genre_rows <- which(moviedata$genre == genre_name)
    ggplot(moviedata[genre_rows, c("length", "rating")],
        aes(x = length, y = rating)) +
        stat_density2d(aes(fill = ..level..), geom = "polygon") +
        geom_point(colour = "red", alpha = 0.4) +
        theme(legend.position = "none",
            axis.title.x = element_blank(),
            axis.title.y = element_blank(),
            plot.margin = unit(c(0, -0.5, -0.5, 0), "cm")) +
        ggtitle(panel_title)
}

plot3 <- genre_density_panel("Action", "Action")
plot4 <- genre_density_panel("ActionComedy", "Action Comedy")
plot5 <- genre_density_panel("ActionDrama", "Action Drama")
plot6 <- genre_density_panel("Comedy", "Comedy")

# Overall scatterplot on top, the four genre panels in a single row beneath.
grid.arrange(plot2, arrangeGrob(plot3, plot4, plot5, plot6, nrow = 1), nrow = 2)

plot of chunk unnamed-chunk-6

3) votes and rating: Each point is sized by the absolute value of its (internally) studentized residual from a linear fit of rating on votes. There were quite a few highly rated movies not seen by many viewers. Of course, there are also many low-production-value, presumably poorly written movies without many IMDB votes. There are two exceptional movies, The Matrix and LOTR: The Fellowship of the Ring, that were able to garner both high votes and high ratings. Star Wars: Episode I generated solid IMDB ratings with a sizable number of votes. Neither rating nor votes is categorical, but it might be interesting to color the points by a categorical Metacritic score, a composite of movie critic ratings.

# Linear fit of rating on votes. The data frame is passed explicitly so the
# model reads the current moviedata columns rather than attached copies.
lmrv <- lm(rating ~ votes, data = moviedata)

# Absolute standardized residuals, stored by name in column V10 rather than by
# position, so the assignment cannot overwrite an existing column.
moviedata$V10 <- abs(rstandard(lmrv))

# Rating against votes: point size follows the absolute residual, the dashed
# green line is the linear fit, and the brown contours show the 2D density.
# Films with an absolute residual above 2 and more than 50000 votes are
# labelled with their title.
plot7 <- ggplot(moviedata, aes(x = votes, y = rating))
plot7 +
    geom_point(size = moviedata$V10) +
    geom_smooth(method = lm, se = FALSE, colour = "green",
        linetype = "dashed") +
    geom_density2d(colour = "brown", alpha = 0.5) +
    geom_text(data = subset(moviedata, (V10 > 2) & (votes > 50000)),
        aes(x = votes, y = rating, label = title)) +
    ggtitle("Rating versus Votes") +
    xlab("Votes") +
    ylab("IMDB Rating")

plot of chunk unnamed-chunk-7

4) year and budgetrange: Within our sample, there are movies with big and modest budgets in just about every year, and more and more big-budget films are being made. However, there is an outlier within the Tiny budget range: the movie Napoleon Dynamite. The Tiny budget could be attributed to the fact that Napoleon Dynamite was the first full-length movie written and directed by Jared Hess. Breaking the budget down into numerical bins might yield a more meaningful analysis. As it currently stands, there is no way to differentiate within a budget category.

# Top panel: budget category against release year, jittered slightly so that
# films from the same year and category do not sit on top of one another.
# Tiny-budget films released after 2000 are labelled with their title.
plot8 <- ggplot(moviedata, aes(x = year, y = budgetrange)) +
    geom_jitter(position = position_jitter(width = 0.2, height = 0.05)) +
    geom_text(data = subset(moviedata, (budgetrange == "Tiny") & (year > 2000)),
        aes(x = year, y = budgetrange, label = title),
        hjust = 1, vjust = 0.2) +
    ggtitle("Budget versus Year") +
    xlab("Year") +
    ylab("Budget Range")

# Release years for each budget category, taken from moviedata itself rather
# than from attached copies of its columns. which() drops NA comparisons.
densitydata1 <- data.frame(
    x = moviedata$year[which(moviedata$budgetrange == "Tiny")])
densitydata2 <- data.frame(
    x = moviedata$year[which(moviedata$budgetrange == "Modest")])
densitydata3 <- data.frame(
    x = moviedata$year[which(moviedata$budgetrange == "BigBudget")])

# Bottom panel: one semi-transparent density of release year per budget
# category, overlaid on shared axes. The colour key is given in the title
# because the fills are fixed colours, not mapped aesthetics.
plot9 <- ggplot() +
    stat_density(data = densitydata1, aes(x = x), fill = "red", alpha = 0.6) +
    stat_density(data = densitydata2, aes(x = x), fill = "Green", alpha = 0.5) +
    stat_density(data = densitydata3, aes(x = x), fill = "Blue", alpha = 0.5) +
    ggtitle("Density Estimates of Budget by Year (Tiny = Red, Modest = Green, Big Budget = Blue)") +
    xlab("Year") +
    ylab("Density")

grid.arrange(plot8, plot9, nrow = 2)

plot of chunk unnamed-chunk-8