R Class Day 1

Install and load the following packages

# Install the CRAN packages only when they are not already available, so that
# re-running the script does not re-download everything each time.
cran.packages <- c("devtools", "wordcloud", "beanplot")
missing.packages <- cran.packages[
  !vapply(cran.packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing.packages) > 0) {
  install.packages(missing.packages)
}

# devtools provides install_github(), which is needed for yarrr
library("devtools")

# yarrr is installed from GitHub, again only when it is missing
if (!requireNamespace("yarrr", quietly = TRUE)) {
  install_github("ndphillips/yarrr")
}

library("yarrr")
library("wordcloud")
library("beanplot")

View the movies dataset

# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; otherwise print the first rows instead.
if (interactive()) {
  View(movies)
} else {
  head(movies)
}

See the names of the columns in the dataset

# List the column names of the movies dataset
colnames(movies)

See summary statistics of box office revenue

# Summary statistics for the total box office revenue column
summary(movies[["boxoffice.total"]])

Show revenue of top n movies

# Number of movies to show
n <- 20

# NOTE(review): rows 1..n are treated as the top earners, which presumes
# `movies` is already sorted by boxoffice.total (descending) -- confirm.
top.idx <- seq_len(n)
top.revenue <- movies$boxoffice.total[top.idx]

# Set up an empty canvas (type = "n"). The y-range covers all revenues plus
# 10% headroom; na.rm = TRUE keeps a single missing revenue from turning the
# limits into NA, which plot() would reject.
plot(1,
     xlab = "",
     ylab = "Total box office revenue",
     main = "Top Movie Revenues",
     xaxt = "n",
     xlim = c(1, n + 2),
     ylim = c(min(movies$boxoffice.total, na.rm = TRUE),
              max(movies$boxoffice.total, na.rm = TRUE) * 1.1),
     type = "n"
     )

# Dashed vertical line from 0 up to each movie's revenue
segments(top.idx, rep(0, n), top.idx, top.revenue, lty = 2)

# One coloured point per movie at the top of its line
points(top.idx, top.revenue,
     pch = 16,
     col = piratepal("info", length.out = n),
     cex = 2
     )

# Movie names, rotated 40 degrees and placed to the right of each point
text(top.idx, top.revenue, movies$name[top.idx], srt = 40, pos = 4)

Create a histogram of the movie lengths

# Mean movie length, computed once; na.rm = TRUE ignores missing lengths
mean.time <- mean(movies$time, na.rm = TRUE)

# `breaks = 20` replaces `n = 20`, which only worked through partial matching
# of hist()'s `nclass` argument; both request roughly 20 bins.
hist(movies$time, breaks = 20, ylim = c(0, 520))

# Vertical line at the mean
abline(v = mean.time)

# Label the mean line near the top of the plot, to the left of the line
text(x = mean.time,
     y = 510,
     labels = paste("mean = ", round(mean.time, 0)),
     pos = 2)

Which movie genres are the most popular? Create a wordcloud!

# Count the movies in each genre (length of each genre's revenue vector)
n.genre <- tapply(movies$boxoffice.total, movies$genre, length)

# Word size reflects how many movies a genre has. `colors` is spelled out in
# full; the previous `col =` only worked through partial argument matching.
wordcloud(words = names(n.genre),
          freq = n.genre,
          colors = piratepal("google"))

Is there a relationship between movie genre and earnings?

# Median revenue and number of movies per genre
median.earnings <- tapply(movies$boxoffice.total, movies$genre, median,
                          na.rm = TRUE)
n.genre <- tapply(movies$boxoffice.total, movies$genre, length)
n.points <- length(median.earnings)
genre.idx <- seq_len(n.points)

# One point per genre, sized by log(number of movies in the genre).
# NOTE(review): a genre with exactly one movie gets cex = log(1) = 0 and is
# therefore drawn with zero size; only its text label remains visible.
# NOTE(review): if movies$genre is a factor with unused levels, tapply()
# returns NA for them -- confirm the data has none.
plot(genre.idx,
     median.earnings,
     col = piratepal("info", length.out = n.points),
     pch = 16, cex = log(n.genre),
     main = "Movie Earnings by Genre",
     xaxt = "n",
     xlim = c(0, n.points + 1),
     xlab = "",
     ylab = "Median Earnings"
     )

# Sub-title explaining the point sizes
mtext("Size of points indicates how many movies are in that genre",
      side = 3, line = 0.5)

# Genre names on top of the points
text(genre.idx, median.earnings, names(median.earnings))

Do movie sequels earn more money than non-sequels? Do the following t-test!

# Two-sample t-test of log-transformed box office revenue, grouped by the
# sequel indicator. The result is stored so that its statistic, degrees of
# freedom and p-value can be reused when annotating a plot.
sequel.ttest <- t.test(
  log(boxoffice.total) ~ sequel,
  data = movies
)

# Print the test result
sequel.ttest

Plot the distribution of revenues for sequels versus non-sequels!

# Bean plot of raw box office revenue, one bean per sequel group.
# The x axis is suppressed here and labelled manually below.
beanplot(boxoffice.total ~ sequel,
       data = movies,
       col = piratepal("google", trans = .3)[1],
       what = c(1, 1, 1, 0),
       main = "Box office revenue of Sequels versus non-Sequels",
       xaxt = "n", ylab = "Box Office Revenue"
       )

# NOTE(review): labels assume the first sequel group is the non-sequels --
# confirm against the coding of movies$sequel.
mtext(c("Non-Sequel", "Sequel"), side = 1, at = c(1, 2), line = 1)

# Rounding a very small p-value to two decimals would print "p = 0";
# report it as "p < 0.01" instead.
p.label <- if (sequel.ttest$p.value < 0.01) {
  "p < 0.01"
} else {
  paste0("p = ", round(sequel.ttest$p.value, 2))
}

# Annotate the plot with the t-test result stored in sequel.ttest.
# NOTE(review): that test was run on log(boxoffice.total), while this plot
# shows the untransformed revenue.
mtext(text = paste0("t(",
                    round(sequel.ttest$parameter, 2), ") = ",
                    round(sequel.ttest$statistic, 2), ", ",
                    p.label),
      side = 3,
      line = .3
      )

Is there a relationship between the running time of a movie and its box office revenue? Do a regression analysis!

# Linear regression of log-transformed box office revenue on movie time,
# fitted only to the rows of movies where time > 0.
model <- lm(log(boxoffice.total) ~ time, 
       data = subset(movies, time > 0))

# Print coefficients, standard errors and fit statistics
summary(model)

Plot the relationship between movie times and boxoffice revenue

# Keep only movies with a positive, finite running time
movies.2 <- subset(movies, time > 0 & is.finite(time))

# Scatter plot of log revenue against running time; the mostly transparent
# gray (alpha = .1) makes dense regions appear darker.
with(movies.2, plot(x = time,
                    y = log(boxoffice.total),
                    pch = 16,
                    col = gray(.1, .1),
                    main = "Movie times and Boxoffice Revenue",
                    ylab = "Revenue (log-transformed)",
                    xlab = "Time (in minutes)"
                    )
     )

# Add the line from the previously fitted regression object `model`
abline(model)

# Add names for n random movies

n <- 5
# seq_len() instead of 1:nrow(), which would yield c(1, 0) for an empty
# data frame and silently sample invalid row numbers
samp <- sample(seq_len(nrow(movies.2)), size = n)

# Pull the sampled rows' coordinates and names once
samp.time <- movies.2$time[samp]
samp.log.revenue <- log(movies.2$boxoffice.total[samp])

# Highlight the sampled movies in colour ...
points(x = samp.time,
       y = samp.log.revenue,
       pch = 16,
       col = piratepal("google", length.out = n))

# ... and write each movie's name below its point
text(x = samp.time,
     y = samp.log.revenue,
     labels = movies.2$name[samp],
     pos = 1)

R Class Day 1

Nathaniel Phillips

21 Oct 2015