- Install the following packages
# Install each dependency only when it is not already available, then attach
# it. Checking first avoids re-downloading every package on each run.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library("devtools")

# yarrr is fetched from GitHub with install_github().
if (!requireNamespace("yarrr", quietly = TRUE)) {
  install_github("ndphillips/yarrr")
}
library("yarrr")

if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library("wordcloud")

if (!requireNamespace("beanplot", quietly = TRUE)) {
  install.packages("beanplot")
}
library("beanplot")
- View the movies dataset
# Open the movies data set in the data viewer.
# `movies` is presumably provided by the yarrr package attached above -- TODO confirm.
View(movies)
- See the names of the columns in the dataset
# Print the column names of the movies data set.
colnames(movies)
- See summary statistics of box office revenue
# Print summary statistics for the total box office revenue column.
summary(object = movies$boxoffice.total)
- Show the revenue of the top n movies
# Plot the box office revenue of the first n rows of the movies data set:
# a dashed stem from zero up to a coloured point per movie, labelled with the
# movie's name.
n <- 20
top.idx <- seq_len(n)
top.revenue <- movies$boxoffice.total[top.idx]

# Empty canvas: type = "n" sets up the axes without drawing the dummy point.
plot(1,
     xlab = "",
     ylab = "Total box office revenue",
     main = "Top Movie Revenues",
     xaxt = "n",
     # Presumably the two extra units leave room for the slanted labels.
     xlim = c(1, n + 2),
     # na.rm = TRUE keeps the limits finite if any revenue value is missing.
     ylim = c(min(movies$boxoffice.total, na.rm = TRUE),
              max(movies$boxoffice.total, na.rm = TRUE) * 1.1),
     type = "n"
)

# Dashed stems from y = 0 up to each movie's revenue.
segments(top.idx, rep(0, n), top.idx, top.revenue, lty = 2)

# One filled point per movie, each in its own palette colour.
points(top.idx, top.revenue,
       pch = 16,
       col = piratepal("info", length.out = n),
       cex = 2
)

# Movie names, rotated 40 degrees and placed to the right of each point.
text(top.idx, top.revenue, movies$name[top.idx], srt = 40, pos = 4)
- Create a histogram of the movie lengths
# Histogram of movie times with a vertical line and a label at the mean.
# The mean is computed once and reused for the line and the label.
mean.time <- mean(movies$time, na.rm = TRUE)

# `breaks = 20` replaces `n = 20`, which only worked because `n` partially
# matched hist()'s `nclass` argument.
hist(movies$time, breaks = 20, ylim = c(0, 520))
abline(v = mean.time)

# Label sits just below the top of the fixed y-range, to the left of the line.
# paste() already inserts a space, so the literal carries no trailing one.
text(x = mean.time,
     y = 510,
     paste("mean =", round(mean.time, 0)),
     pos = 2)
- Which movie genres are the most popular? Create a wordcloud!
# Word cloud of genres, with each genre weighted by how many movies it has.
# tapply(..., length) counts the rows that fall into each genre.
n.genre <- tapply(movies$boxoffice.total, movies$genre, length)

# `colors` is spelled out in full; the original `col` relied on partial
# argument matching.
wordcloud(words = names(n.genre),
          freq = n.genre,
          colors = piratepal("google"))
- Is there a relationship between movie genre and earnings?
# Median box office revenue per genre, drawn as one labelled point per genre.
median.earnings <- tapply(movies$boxoffice.total, movies$genre, median, na.rm = TRUE)
# Number of movies in each genre; drives the point size below.
n.genre <- tapply(movies$boxoffice.total, movies$genre, length)
n.points <- length(median.earnings)
genre.idx <- seq_along(median.earnings)

plot(genre.idx,
     median.earnings,
     col = piratepal("info", length.out = n.points),
     # NOTE(review): log(1) is 0, so a genre with a single movie gets a point
     # of size zero -- confirm whether that is intended.
     pch = 16, cex = log(n.genre),
     main = "Movie Earnings by Genre",
     xaxt = "n",
     xlim = c(0, n.points + 1),
     xlab = "",
     ylab = "Median Earnings"
)
mtext("Size of points indicates how many movies are in that genre", side = 3, line = 0.5)

# Write each genre's name on top of its point.
text(genre.idx, median.earnings, names(median.earnings))
- Do movie sequels earn more money than non-sequels? Run the following t-test!
# Compare log-transformed box office revenue between the groups defined by
# `sequel`, using t.test() with its default settings. The result is stored
# because the bean plot further down reads its statistic, degrees of freedom
# and p-value.
sequel.ttest <- t.test(log(boxoffice.total) ~ sequel,
data = movies)
# Print the test result.
sequel.ttest
- Plot the distribution of revenues for sequels versus non-sequels!
# Bean plot of box office revenue for each level of `sequel`, annotated with
# the t-test result stored in `sequel.ttest`.
# NOTE: the t-test was run on log-transformed revenue, while this plot shows
# the untransformed revenue.
beanplot(boxoffice.total ~ sequel,
         data = movies,
         col = piratepal("google", trans = .3)[1],
         # Presumably toggles: overall average line, beans, per-bean average,
         # individual bean lines -- confirm against the beanplot docs.
         what = c(1, 1, 1, 0),
         main = "Box office revenue of Sequels versus non-Sequels",
         xaxt = "n", ylab = "Box Office Revenue"
)

# Custom x-axis labels (the default axis is suppressed with xaxt = "n").
# Assumes the first level of `sequel` is the non-sequel group -- TODO confirm.
mtext(c("Non-Sequel", "Sequel"), side = 1, at = c(1, 2), line = 1)

# A very small p-value would round to 0 and print as the misleading "p = 0";
# report it as "p < 0.01" instead. isTRUE() keeps an NA/NaN p-value from
# raising an error in the condition.
p.value <- sequel.ttest$p.value
p.label <- if (isTRUE(p.value < 0.01)) {
  "p < 0.01"
} else {
  paste0("p = ", round(p.value, 2))
}

mtext(text = paste0("t(",
                    round(sequel.ttest$parameter, 2), ") = ",
                    round(sequel.ttest$statistic, 2), ", ",
                    p.label),
      side = 3,
      line = .3
)
- Is there a relationship between the length of a movie (in minutes) and its box office revenue? Run a regression analysis!
# Linear regression of log-transformed box office revenue on movie time,
# fitted only to the rows whose time is greater than zero. The fitted model
# is stored because the scatter plot further down draws its regression line.
model <- lm(log(boxoffice.total) ~ time,
data = subset(movies, time > 0))
# Print the coefficient table and fit statistics.
summary(model)
- Plot the relationship between movie lengths and box office revenue
# Scatter plot of log revenue against movie time, with the fitted regression
# line and a few randomly chosen movies highlighted and labelled.

# Keep only rows with a positive, finite time.
movies.2 <- subset(movies, time > 0 & is.finite(time))
with(movies.2, plot(x = time,
                    y = log(boxoffice.total),
                    pch = 16,
                    # Dark grey at 10% opacity, so overlapping points show density.
                    col = gray(.1, .1),
                    main = "Movie times and Boxoffice Revenue",
                    ylab = "Revenue (log-transformed)",
                    xlab = "Time (in minutes)"
)
)
# Regression line from the model fitted earlier in the script.
abline(model)

# Add names for n random movies
n <- 5
# Never ask for more movies than there are rows.
n.labelled <- min(n, nrow(movies.2))
# sample.int() draws row indices directly and avoids the 1:nrow() pitfall,
# where 1:0 would yield c(1, 0) for an empty data set.
samp <- sample.int(nrow(movies.2), size = n.labelled)
points(x = movies.2$time[samp],
       y = log(movies.2$boxoffice.total[samp]),
       pch = 16,
       col = piratepal("google", length.out = n.labelled))
# Names are written below the highlighted points.
text(x = movies.2$time[samp],
     y = log(movies.2$boxoffice.total[samp]),
     labels = movies.2$name[samp],
     pos = 1)