library(ggplot2)
library(GGally)
## Loading required package: reshape Loading required package: plyr
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:plyr':
##
## rename, round_any
library(gridExtra)
## Loading required package: grid
# Read the IMDB movie sample and replace the CSV's header with nine short
# column names in a single step.
moviedata <- setNames(
  read.csv("imdb-movies.csv"),
  c(
    "X", "title", "year", "length", "rating", "votes", "mpaa",
    "genre", "budgetrange"
  )
)
# Expose the columns as bare names (e.g. `rating`, `year`) for the code below.
# NOTE: attach() copies the columns as they are right now; later changes to
# `moviedata` are not reflected in the attached copies.
attach(moviedata)
1) Summary statistics for all variables in the data frame.
# Print a per-column summary of the data frame: quartiles and mean for the
# numeric columns, level counts for the factor columns.
summary(moviedata)
## X title year
## Min. : 145 102 Dalmatians : 1 Min. :1990
## 1st Qu.:13403 13th Warrior, The : 1 1st Qu.:1996
## Median :27852 2 Fast 2 Furious : 1 Median :2000
## Mean :28137 2001: A Space Travesty : 1 Mean :1999
## 3rd Qu.:43814 3000 Miles to Graceland: 1 3rd Qu.:2002
## Max. :58787 51st State, The : 1 Max. :2004
## (Other) :597
## length rating votes mpaa
## Min. : 70 Min. :2.30 Min. : 1020 PG : 69
## 1st Qu.: 92 1st Qu.:5.20 1st Qu.: 3414 PG-13:207
## Median :100 Median :5.90 Median : 7728 R :216
## Mean :106 Mean :5.84 Mean : 12923 UR :111
## 3rd Qu.:116 3rd Qu.:6.60 3rd Qu.: 16108
## Max. :251 Max. :9.00 Max. :157608
##
## genre budgetrange
## Action :163 BigBudget:340
## ActionComedy: 80 Modest :257
## ActionDrama :114 Tiny : 6
## Comedy :246
##
##
##
2) Pairwise scatterplot and correlations for the numerical variables year, length, rating and votes.
# Pairwise plot matrix of the four numeric columns, selected by name.
ggpairs(
  moviedata[, c("year", "length", "rating", "votes")],
  title = "Pairwise Scatterplot And Correlations For Numerical Variables",
  legends = FALSE
)
3) Boxplot of movie ratings by budget.
# Order the budget categories from smallest to largest so the boxplots appear
# in ascending budget order on the x axis instead of the default factor order.
moviedata$budgetrange <- ordered(
  moviedata$budgetrange,
  levels = c("Tiny", "Modest", "BigBudget")
)

# Distribution of IMDB ratings within each budget category.
plot1 <- ggplot(moviedata, aes(x = budgetrange, y = rating))
plot1 +
  geom_boxplot() +
  # Title previously read "Boxpolot"; corrected spelling.
  ggtitle("Boxplot of Movie Ratings by Budget") +
  xlab("Budget ($)") +
  ylab("IMDB Ratings") +
  theme_bw()
1) length and genre: Action dramas were, on average, the longest films, followed by action films and then action comedies. Comedies were the shortest type of film. The analysis could be strengthened if there were a standalone drama category.
# Movie length broken down by genre: one boxplot per genre, with the
# individual movies drawn on top and coloured by genre.
plot2 <- ggplot(data = moviedata, mapping = aes(x = genre, y = length))
plot2 +
  theme_bw() +
  geom_boxplot() +
  geom_point(mapping = aes(colour = genre)) +
  ylab("Length (min)") +
  ggtitle("Boxplot of Movie Length by Genre")
2) length and rating: On average, longer movies tend to have higher ratings. Areas of greatest density are indicated by the 2D contours. Within this dataset, there are 6 movies longer than 3 hours: Bound by Honor, Gods and Generals, Heat, and the 3 Lord of the Rings movies. All 6 are big-budget action movies with fairly strong reviews; only Gods and Generals failed to reach a 7.4 IMDB rating. Neither rating nor length is categorical. As mentioned above, adding a drama genre would help the analysis. The individual scatterplot for each genre is also presented.
# Build one compact panel of rating against length for a single genre:
# filled 2D density contours with the individual movies overplotted in red.
# Axis titles and the legend are dropped and the margins tightened so that
# four panels fit side by side under the main scatterplot.
#
# data:        data frame with `genre`, `length` and `rating` columns.
# genre_name:  value of `genre` to keep.
# panel_title: title shown above the panel.
genre_density_panel <- function(data, genre_name, panel_title) {
  # which() drops NA comparisons, matching what subset() did originally.
  genre_rows <- which(data$genre == genre_name)
  ggplot(
    data[genre_rows, c("length", "rating")],
    aes(x = length, y = rating)
  ) +
    stat_density2d(aes(fill = ..level..), geom = "polygon") +
    geom_point(colour = "red", alpha = 0.4) +
    theme(
      legend.position = "none",
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      plot.margin = unit(c(0, -0.5, -0.5, 0), "cm")
    ) +
    ggtitle(panel_title)
}

# Main scatterplot of rating against length for all movies, with 2D density
# contours. Movies longer than 180 minutes are labelled with their length.
# (The original added geom_point() twice; the duplicate layer is removed.)
plot2 <- ggplot(moviedata, aes(x = length, y = rating)) +
  geom_point() +
  geom_density2d() +
  geom_text(
    data = subset(moviedata, length > 180),
    aes(x = length, y = rating, label = length)
  ) +
  ggtitle("Scatterplot of Ratings by Length") +
  xlab("Length (min)") +
  ylab("IMDB Ratings") +
  theme_bw()

# One density panel per genre.
plot3 <- genre_density_panel(moviedata, "Action", "Action")
plot4 <- genre_density_panel(moviedata, "ActionComedy", "Action Comedy")
plot5 <- genre_density_panel(moviedata, "ActionDrama", "Action Drama")
plot6 <- genre_density_panel(moviedata, "Comedy", "Comedy")

# Main plot on top, the four genre panels in a single row underneath.
grid.arrange(
  plot2,
  arrangeGrob(plot3, plot4, plot5, plot6, nrow = 1),
  nrow = 2
)
3) votes and rating: Points are sized by the absolute value of their standardized residuals from a linear fit of rating on votes. There were quite a few highly rated movies not seen by many viewers. Of course, there are also many low-production-value, presumably poorly written movies without many IMDB votes. There are two exceptional movies, The Matrix and LOTR: The Fellowship of the Ring, that were able to garner both high votes and high ratings. Star Wars: Episode I generated solid IMDB ratings with a sizable number of votes. Neither rating nor votes is categorical, but it might be interesting to color the points by a categorical Metacritic score, a composite of movie critic ratings.
# Linear fit of rating on votes. The columns are taken from `moviedata`
# explicitly instead of relying on copies placed on the search path by
# attach().
lmrv <- lm(rating ~ votes, data = moviedata)

# Absolute standardized residuals, computed once and stored under the same
# column name (`V10`) that the original positional assignment
# `moviedata[, 10] = ...` produced on the nine-column data frame.
moviedata$V10 <- abs(rstandard(lmrv))

# Rating against votes: points sized by |standardized residual|, the fitted
# line dashed in green, density contours in brown. Movies that are both poorly
# fitted (V10 > 2) and heavily voted (> 50000 votes) are labelled by title.
plot7 <- ggplot(moviedata, aes(x = votes, y = rating))
plot7 +
  geom_point(size = moviedata$V10) +
  geom_smooth(
    method = lm, se = FALSE, colour = "green", linetype = "dashed"
  ) +
  ggtitle("Rating versus Votes") +
  xlab("Votes") +
  ylab("IMDB Rating") +
  geom_density2d(colour = "brown", alpha = 0.5) +
  geom_text(
    data = subset(moviedata, V10 > 2 & votes > 50000),
    aes(x = votes, y = rating, label = title)
  )
4) year and budgetrange: Within our sample, there are movies with big and modest budgets in just about every year. More and more big-budget films are being made. However, there is an outlier within the Tiny budget range: the movie Napoleon Dynamite. The Tiny budget could be attributed to the fact that Napoleon Dynamite was the first full-length movie written and directed by Jared Hess. Breaking the budget down into numerical bins might yield a more meaningful analysis. As it currently stands, there is no way to differentiate within a budget category.
# Jittered strip plot of budget category against release year. Tiny-budget
# movies released after 2000 are labelled with their title.
plot8 <- ggplot(moviedata, aes(x = year, y = budgetrange)) +
  ggtitle("Budget versus Year") +
  xlab("Year") +
  ylab("Budget Range") +
  geom_jitter(position = position_jitter(width = 0.2, height = 0.05)) +
  geom_text(
    data = subset(moviedata, budgetrange == "Tiny" & year > 2000),
    aes(x = year, y = budgetrange, label = title),
    hjust = 1,
    vjust = 0.2
  )

# Release years of the movies in one budget category, as a one-column data
# frame (`x`) ready for stat_density(). Reads from the data frame passed in
# instead of the attach()ed column copies; which() drops NA comparisons, as
# subset() did originally.
years_in_budget_range <- function(data, budget_level) {
  data.frame(x = data$year[which(data$budgetrange == budget_level)])
}

densitydata1 <- years_in_budget_range(moviedata, "Tiny")
densitydata2 <- years_in_budget_range(moviedata, "Modest")
densitydata3 <- years_in_budget_range(moviedata, "BigBudget")

# Overlaid density estimates of release year, one layer per budget category.
# Kept as three separate layers so the densities overlap and are not stacked.
plot9 <- ggplot() +
  stat_density(data = densitydata1, aes(x = x), fill = "red", alpha = 0.6) +
  stat_density(data = densitydata2, aes(x = x), fill = "Green", alpha = 0.5) +
  stat_density(data = densitydata3, aes(x = x), fill = "Blue", alpha = 0.5) +
  ggtitle("Density Estimates of Budget by Year (Tiny = Red, Modest = Green, Big Budget = Blue)") +
  xlab("Year") +
  ylab("Density")

# Strip plot on top, density estimates underneath.
grid.arrange(plot8, plot9, nrow = 2)